代码没什么技术含量,希望能帮到刚入门的小白。
import requests
import re,json
from lxml import etree
import csv
class Spider():
    """Scrape movie entries from the Maoyan board pages into a CSV file.

    Each page is downloaded with ``requests``, parsed with ``lxml`` and the
    extracted rows (title, starring, release date, score) are appended to
    ``CSV_PATH``. Tunables are class attributes so a subclass or instance
    can override them without touching the methods.
    """

    # Output file and the header row written at its start.
    CSV_PATH = 'data.csv'
    # Explicit encoding: the scraped text is Chinese, and the locale default
    # encoding is not guaranteed to be able to represent it.
    CSV_ENCODING = 'utf-8'
    FIELDNAMES = ['title', 'star', 'date', 'score']

    # Page URL; the placeholder receives the entry offset of the page.
    URL_TEMPLATE = 'http://maoyan.com/board/4?offset={}'
    PAGE_SIZE = 10
    # NOTE(review): the original loop requested 11 pages (offsets 0..100).
    # If the board only has 100 entries the last request yields no rows and
    # this can be lowered to 10 -- confirm against the site.
    PAGE_COUNT = 11
    # Seconds to wait for the server before giving up on a page.
    TIMEOUT = 10
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                      ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    def open_csv(self):
        """Make sure the CSV file exists and starts with the header row.

        The file is opened in append mode so existing data is kept. The
        header is written only when the file is empty, so running the
        spider again does not insert a second header in the middle of the
        data.

        :return: None
        """
        with open(self.CSV_PATH, 'a', newline='', encoding=self.CSV_ENCODING) as f:
            # In append mode the position starts at the end of the file, so
            # a position of 0 means the file is new or empty.
            if f.tell() == 0:
                csv.writer(f).writerow(self.FIELDNAMES)

    def __get_page(self, url, headers):
        """Download a page and return its text.

        :param url: address of the page to fetch
        :param headers: HTTP headers to send with the request
        :return: the response body as text, or None when the request fails
                 or the status code is not 200
        """
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=self.TIMEOUT)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    @staticmethod
    def _first_text(node, path):
        """Return the first result of an XPath query on node, or '' if none."""
        matches = node.xpath(path)
        return matches[0] if matches else ''

    def _extract_rows(self, html):
        """Parse one board page and return its rows as lists of strings.

        :param html: page source; None or an empty string gives no rows
        :return: list of [title, star, date, score] lists
        """
        if not html:
            return []
        tree = etree.HTML(html)
        if tree is None:
            return []

        rows = []
        for item in tree.xpath('//*[@class="board-wrapper"]/dd/div/div'):
            # title, starring, release date, score
            row = [
                self._first_text(item, './div[1]/p[1]/a/text()'),
                self._first_text(item, './div[1]/p[2]/text()').strip(),
                self._first_text(item, './div[1]/p[3]/text()'),
                # The score is split over two <i> elements (presumably the
                # integer and fractional parts); join them back together.
                self._first_text(item, './div[2]/p/i[1]/text()')
                + self._first_text(item, './div[2]/p/i[2]/text()'),
            ]
            # An entry in which nothing matched carries no information.
            if any(row):
                rows.append(row)
        return rows

    def __parse_page(self, html):
        """Parse the HTML of one page and append the extracted rows to the CSV.

        A field that is missing from an entry is written as an empty string
        instead of aborting the whole run.

        :param html: page source, or None when the download failed
        :return: None
        """
        rows = self._extract_rows(html)
        if not rows:
            return
        # Open the file once per page rather than once per row.
        with open(self.CSV_PATH, 'a', newline='', encoding=self.CSV_ENCODING) as f:
            csv.writer(f).writerows(rows)

    def run(self):
        """Program entry point: write the header, then scrape every page.

        Pages that cannot be downloaded are skipped so that one failed
        request does not stop the remaining pages from being processed.

        :return: None
        """
        self.open_csv()
        for page in range(self.PAGE_COUNT):
            url = self.URL_TEMPLATE.format(self.PAGE_SIZE * page)
            html = self.__get_page(url, self.HEADERS)
            if html is None:
                continue
            self.__parse_page(html)
# Instantiate the class and start the crawl.
spider = Spider()
spider.run()