The highlight of this script is BeautifulSoup's select() method, which lets you pull data straight out of an HTML page by the hierarchical tag path that leads to it.
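Before the full spider, here is a minimal self-contained sketch of the technique (the HTML fragment, tag names and hrefs are invented purely for illustration): select() takes a CSS selector, and each "parent > child" step descends one level of the tag hierarchy.

from bs4 import BeautifulSoup

# A made-up page fragment, just to show hierarchical selectors.
html = """
<html><body>
  <div class="wrap">
    <ul>
      <li><a href="/film/1.html">Film One</a></li>
      <li><a href="/film/2.html">Film Two</a></li>
    </ul>
  </div>
</body></html>
"""

soup = BeautifulSoup(html, "lxml")
# "parent > child" matches direct children at every level.
for a in soup.select("html > body > div.wrap > ul > li > a"):
    print a['href'], a.get_text()

The spider below applies exactly this pattern to the live pages.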
# -*- coding: gb18030 -*-
__author__ = 'vincent'

import sys
import urllib
import urllib2
import cookielib
from bs4 import BeautifulSoup


class Spider66ys:
    headers = None
    home_url = None

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:50.0) Gecko/20100101 Firefox/50.0'
        }
        # Site root of 66ys; left blank in the original post.
        self.home_url = ""
    # Fetch the raw HTML of a page
    def get_html(self, url):
        print "Fetching page [", url, "] ..."
        if len(url) == 0:
            print "Input url is null!"
            sys.exit(0)
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        html = response.read()
        # print "Finished fetching page (", url, ")."
        return html
    # Extract a film's download links from its detail page
    def get_download_url(self, film):
        print "Fetching download links for film [", film[1], "] from page [", film[0], "] ..."
        html = self.get_html(film[0])
        # fp = open("film.html", "w")
        # fp.write(html)
        # fp.close()
        soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
        # print soup.prettify()
        results = soup.select("html > body > div.wrap > div.mainleft "
                              "> div.contentinfo > div#text > table > tbody > tr > td > a")
        for result in results:
            film.append(result['href'])
    # Fetch the most recently updated films from the homepage
    def get_new_update(self):
        new_film_list = []
        print "Fetching newly updated films from [", self.home_url, "] ..."
        html = self.get_html(self.home_url)
        # fp = open("66ys.html", "w")
        # fp.write(html)
        # fp.close()
        soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
        results = soup.select("html > body > div.wrap > div.tnlist > ul > li > a")
        for result in results:
            film = []
            film.append(result['href'])
            # Film titles on the site are gb18030-encoded
            film.append(result.getText().encode('gb18030').strip())
            self.get_download_url(film)
            new_film_list.append(film)
        return new_film_list
    # Search 66ys for films matching a keyword
    def search_film(self, content):
        search_film_list = []
        search_url = self.home_url + "/e/search/index.php"
        print "Searching for film [", content, "] ..."
        postDict = {
            "keyboard": content,
            "show": "title,smalltext",
            "submit": "",
            "tbname": "Article",
            "tempid": "1"
        }
        postData = urllib.urlencode(postDict)
        # The search page needs a session cookie, so install a cookie-aware opener
        cookie_jar = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)
        request = urllib2.Request(search_url, postData, headers=self.headers)
        response = urllib2.urlopen(request)
        html = response.read()
        # fp = open("search.html", "w")
        # fp.write(html)
        # fp.close()
        soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
        # A single <b> in this table marks the "no results" page
        results = soup.select("html > body > table.tableborder > tr > td > div > b")
        if len(results) == 1:
            print "No matching films found."
            return None
        results = soup.select("html > body > div > div.wrap > div.mainleft > div.channellist "
                              "> div.listBox > ul > li div.listInfo > h3 > a")
        # print results
        for result in results:
            film = []
            film.append(result['href'])
            film.append(result.getText().encode('gb18030').strip())
            self.get_download_url(film)
            search_film_list.append(film)
        print "Found [", len(results), "] films in total."
        return search_film_list
if __name__ == "__main__":
    spider = Spider66ys()
    # new_film_list = spider.get_new_update()
    # for film in new_film_list:
    #     for info in film:
    #         print info, "\t"
    #     print ""
    content = "冰与火之歌"  # search keyword: "A Song of Ice and Fire"
    search_film_list = spider.search_film(content)
    if search_film_list:
        for film in search_film_list:
            print film[1], ":"
            for info in film[2:]:
                print info
            print "-" * 200