下面的脚本用于下载 VOA 英语网站（51voa.com）上的相关内容：

脚本需要用到 Nokogiri 库，此库的相关使用介绍可参考其官方文档。


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/ruby
require  'open-uri'
require  'nokogiri'
# Root URL of the site being scraped; every relative link below is joined onto it.
www = 'http://www.51voa.com'

# Fetch and parse the first "Learn A Word" index page.
# URI.open is used instead of bare Kernel#open because Ruby 3.0+ no longer lets
# open-uri hook Kernel#open; the block form closes the response once it is parsed.
doc = URI.open("#{www}/Learn_A_Word_1.html") { |page| Nokogiri::HTML(page) }

# href of every anchor inside div#pagelist (the links to the other index pages).
pagelist = doc.css('div#pagelist a').map { |link| link['href'] }
# Collects the links listed on one index page.
#
# @param address [String] absolute URL of the index page to fetch
# @return [Array<String>] the href attribute of every anchor matched by "div#list a"
def get_child_page(address)
  # URI.open rather than bare Kernel#open: Ruby 3.0+ no longer lets open-uri hook
  # Kernel#open. The block form closes the response stream once it has been parsed.
  doc = URI.open(address) { |page| Nokogiri::HTML(page) }
  doc.css('div#list a').map { |link| link['href'] }
end
# Downloads a remote file and stores it as "<file>.mp3".
#
# @param mp3 [String] absolute URL of the audio file (taken from a scraped page,
#   so it is treated as untrusted input)
# @param file [String] destination path without extension; ".mp3" is appended
# @return [Integer] number of bytes written
# @raise [ArgumentError] if +mp3+ parses as a URI that open-uri cannot fetch
#   (for example a relative path)
# @raise [URI::InvalidURIError] if +mp3+ is not a parseable URI at all
def download(mp3, file)
  # Parse first and call #open on the URI object: handing a scraped string to
  # Kernel#open would execute it as a command when it starts with "|".
  source = URI.parse(mp3)
  raise ArgumentError, "not a downloadable URL: #{mp3.inspect}" unless source.respond_to?(:open)

  # The destination is opened only after the fetch succeeded, so a failed
  # download does not leave an empty .mp3 behind. Both handles are closed by
  # their blocks, and the body is streamed instead of read into one String.
  source.open do |remote|
    File.open("#{file}.mp3", 'wb') { |out| IO.copy_stream(remote, out) }
  end
end
# Writes text to "<file>.txt", replacing any existing file of that name.
#
# @param txt [String] content to write; a trailing newline is added by +puts+
#   when the text does not already end with one
# @param file [String] destination path without extension; ".txt" is appended
# @return [nil]
def writefile(txt, file)
  # Block form guarantees the handle is closed even if the write raises.
  File.open("#{file}.txt", 'w') { |out| out.puts txt }
end
# Walk every index page, then every article linked from it, and save each
# article's audio as "<title>.mp3" and its text as "<title>.txt".
pagelist.each do |address|
  get_child_page("#{www}/#{address}").each do |result|
    begin
      # URI.open instead of bare Kernel#open (not hooked by open-uri on Ruby 3.0+);
      # the block form closes the response once the page is parsed.
      page = URI.open(www + result) { |io| Nokogiri::HTML(io) }

      txt = page.css('div#content').text
      # The title becomes the base file name, with spaces replaced by underscores.
      name = page.css('div#title').text.strip.tr(' ', '_')
      # name = page.css('div#title').text.encode("GBK")
      # ^ use String#encode like this to save the file under its Chinese name.

      link = page.css('div#menubar a').first
      if link.nil?
        # As before, an article with no audio link is skipped entirely.
        warn "skipped #{result}: no audio link found"
        next
      end

      download(link['href'], name)
      writefile(txt, name)
    rescue StandardError => e
      # Best effort: one broken article must not stop the run, but say why it was skipped.
      warn "skipped #{result}: #{e.class}: #{e.message}"
    end
  end
end