Scraping the Douban Top250 with Scrapy
1. Create the Scrapy project
- Press Win+R, type cmd, and press Enter to open a command prompt.
- In the command window, run: scrapy startproject <project name> (this tutorial uses dbtop as the project name).
If the command prints a success message, the project was created.
- Run cd dbtop/dbtop/spiders to enter the spiders directory, then run: scrapy genspider db movie.douban.com/top250
- Open the Scrapy project created above in PyCharm, then open db.py under dbtop/dbtop/spiders.
The spider code will go into db.py.
- Before crawling the Douban Top250 data, open settings.py and comment out the line ROBOTSTXT_OBEY = True.
- In settings.py, also uncomment the DOWNLOADER_MIDDLEWARES and ITEM_PIPELINES blocks.
- After uncommenting, set them (together with USER_AGENT) as follows:

```python
DOWNLOADER_MIDDLEWARES = {
    'dbtop.middlewares.DbtopDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    'dbtop.pipelines.DbtopPipeline': 300,
}

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1643.42'
```
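The USER_AGENT setting is what actually gets requests past Douban's user-agent check; the DbtopDownloaderMiddleware generated by Scrapy passes requests through unchanged. If you want the enabled middleware to attach the header itself, a minimal sketch of the class in dbtop/middlewares.py could look like this (a simplified version, not the full generated template):

```python
class DbtopDownloaderMiddleware:
    def process_request(self, request, spider):
        # copy the USER_AGENT configured in settings.py onto every outgoing request
        request.headers['User-Agent'] = spider.settings.get('USER_AGENT')
        return None  # returning None lets Scrapy keep processing the request normally
```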
2. Write the code
- Open items.py:
```python
import scrapy


class ScrapyDbtop250Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    ord = scrapy.Field()            # ranking
    name = scrapy.Field()           # movie title
    director = scrapy.Field()       # director
    scriptwriters = scrapy.Field()  # screenwriters
    star = scrapy.Field()           # lead actors
    type = scrapy.Field()           # genre
    region = scrapy.Field()         # country/region
    language = scrapy.Field()       # language
    releaseDate = scrapy.Field()    # release date
    length = scrapy.Field()         # runtime
    synopsis = scrapy.Field()       # synopsis
    img = scrapy.Field()            # poster image URL
```
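Once the fields are declared, an item can be filled and read like a dict; a quick illustration with made-up values:

```python
item = ScrapyDbtop250Item(ord='1', name='肖申克的救赎')
item['director'] = '弗兰克·德拉邦特'
print(item['name'], dict(item))   # items convert cleanly to plain dicts
```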
- Open db.py:
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dbtop.items import ScrapyDbtop250Item


class DbSpider(CrawlSpider):
    name = 'db'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250?start=0&filter=']

    rules = (
        Rule(LinkExtractor(allow=r'\?start=\d+&filter='), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        videos_a = response.xpath('//div[@class="hd"]/a/@href')
        for v_a in videos_a:
            url = v_a.extract()
            # request each movie's detail page
            yield scrapy.Request(url=str(url), callback=self.parse_second)

    # strip whitespace and \n, dropping empty strings
    def dislodge_blank(self, src):
        price = [sy.strip() for sy in src if sy.strip() != '']
        return price

    def parse_second(self, response):
        # ranking
        ord = response.xpath('//div[@id="content"]/div[@class="top250"]/span[1]/text()').extract()[0]
        # movie title
        name = response.xpath('//div[@id="content"]/h1/span')[0].xpath('./text()').extract()[0]
        if len(name) == 0:
            name = '无电影名'
        # director
        director = response.xpath('//div[@id="info"]/span')[0].xpath('./span[@class="attrs"]/a/text()').extract()[0]
        if len(director) == 0:
            director = '无导演'
        # screenwriters
        try:
            response.xpath('//div[@id="info"]/span')[1].xpath('./span[@class="attrs"]/a/text()').extract()[0]
        except Exception:
            scriptwriters = '无编剧信息'
        else:
            scriptwriters = \
                response.xpath('//div[@id="info"]/span')[1].xpath('./span[@class="attrs"]/a/text()').extract()[0]
        # lead actors
        star = '/'.join(response.xpath('//span[@class="actor"]//a/text()').extract())
        if len(star) == 0:
            star = '无主演'
        # genre
        type = '/'.join(response.xpath('//div[@id="info"]/span[@property="v:genre"]/text()').extract())
        # country/region
        region = response.xpath('//span[contains(./text(), "制片国家/地区:")]/following::text()[1]').extract()[0]
        if len(region) == 0:
            region = '无地区'
        # language
        language = response.xpath('//span[contains(./text(), "语言:")]/following::text()[1]').extract()[0]
        if len(language) == 0:
            language = '无语言'
        # release date
        releaseDate = self.dislodge_blank(
            src=response.xpath('//div[@id="info"]//span[@property="v:initialReleaseDate"]/@content').extract())[0]
        # runtime
        length = self.dislodge_blank(
            src=response.xpath('//div[@id="info"]//span[@property="v:runtime"]/@content').extract())[0] + "分钟"
        # synopsis
        synopsis = ''.join(response.xpath('//div[@id="link-report"]/span/text()').extract()).replace(
            u'\u3000', u'').replace('\n', '').replace('\r', '').replace(" ", "").replace('"', "'")
        # poster image URL
        img = response.xpath('//div[@id="mainpic"]/a/img/@src').extract()[0]

        data = ScrapyDbtop250Item(ord=ord, name=name, director=director, scriptwriters=scriptwriters,
                                  star=star, type=type, region=region, language=language,
                                  releaseDate=releaseDate, length=length, synopsis=synopsis, img=img)
        yield data
```
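Before running the full CrawlSpider it can save time to test the detail-page XPaths interactively. Inside the project directory, scrapy shell <detail page URL> opens a shell where response is already fetched using the project settings (including the USER_AGENT above); the URL below is just one Top250 entry used as an example, and the expressions assume the markup the spider expects:

```python
# started with:  scrapy shell https://movie.douban.com/subject/1292052/
response.xpath('//div[@id="content"]/h1/span/text()').extract_first()                      # movie title
response.xpath('//div[@id="info"]/span[@property="v:genre"]/text()').extract()             # genres
response.xpath('//div[@id="info"]//span[@property="v:runtime"]/@content').extract_first()  # runtime
```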
- Open pipelines.py:
```python
from itemadapter import ItemAdapter

# load the project settings (for the DB_* values added below)
from scrapy.utils.project import get_project_settings
import pymysql
import os
import csv


# writes every item to book.json
class DbtopPipeline:
    # open_spider/close_spider are used so the file is opened only once
    # runs once, before the spider starts
    def open_spider(self, spider):
        self.fd = open('book.json', 'w', encoding='utf-8')

    # item is the object yielded by the spider
    def process_item(self, item, spider):
        self.fd.write(str(item))
        return item

    # runs once, after the spider has finished
    def close_spider(self, spider):
        self.fd.close()


# writes every item to MySQL
class MysqlPipelines:
    # open the database connection
    def open_spider(self, spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.user = settings['DB_USER']
        self.port = settings['DB_PORT']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    # build and execute the insert statement
    def process_item(self, item, spider):
        sql = 'insert into dbtop250(ord,name,director,scriptwriters,star,type,region,language,releaseDate,length,synopsis,img) ' \
              'values ("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}");'.format(
                  item["ord"], item["name"], item["director"], item["scriptwriters"], item["star"], item["type"],
                  item["region"], item["language"], item["releaseDate"], item["length"], item["synopsis"], item["img"])
        print(sql)
        self.cursor.execute(sql)
        self.conn.commit()
        return item

    # close the connection
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()


# exports every item to a csv file
class ExcelPipelines:
    def __init__(self):
        # location of the csv file; it does not need to exist beforehand
        store_file = os.path.dirname(__file__) + '/spiders/csdn.csv'
        # open (create) the file
        self.file = open(store_file, 'w', encoding='utf-8', newline='')
        self.writer = csv.writer(self.file, dialect="excel")
        self.writer.writerow(
            ["排名", "电影名称", "导演", "编剧", "主演", "类型", "地区", "语言", "上映时间", "片长", "简介", "图片地址"])

    def process_item(self, item, spider):
        line = [item['ord'], item['name'], item['director'], item['scriptwriters'],
                item['star'], item['type'], item['region'], item['language'],
                item['releaseDate'], item['length'], item['synopsis'], item['img']]
        self.writer.writerow(line)
        return item

    def close_spider(self, spider):
        self.file.close()
```
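The insert statement above builds SQL by string formatting, which breaks as soon as a field contains a double quote (the spider strips them, but only from the synopsis). A safer variant of MysqlPipelines.process_item that lets pymysql do the escaping could look like this sketch, using the same table and fields:

```python
# drop-in replacement for MysqlPipelines.process_item (parameter binding instead of str.format)
def process_item(self, item, spider):
    sql = ('insert into dbtop250(ord,name,director,scriptwriters,star,type,region,'
           'language,releaseDate,length,synopsis,img) '
           'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    values = [item[k] for k in ('ord', 'name', 'director', 'scriptwriters', 'star', 'type',
                                'region', 'language', 'releaseDate', 'length', 'synopsis', 'img')]
    self.cursor.execute(sql, values)   # pymysql escapes each value itself
    self.conn.commit()
    return item
```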
In settings.py, update ITEM_PIPELINES and add DB_HOST, DB_PORT, DB_USER, DB_PASSWORD, DB_NAME, and DB_CHARSET:
```python
ITEM_PIPELINES = {
    # json
    'dbtop.pipelines.DbtopPipeline': 300,
    # MySQL
    'dbtop.pipelines.MysqlPipelines': 301,
    # csv
    'dbtop.pipelines.ExcelPipelines': 302,
}

DB_HOST = '127.0.0.1'
DB_PORT = 3306                       # your MySQL port (3306 is the default)
DB_USER = 'your_mysql_user'          # your MySQL account
DB_PASSWORD = 'your_mysql_password'  # your MySQL password
DB_NAME = 'reptilebank'
DB_CHARSET = 'utf8'
```
3. Create the database
Run the following SQL (MysqlPipelines expects a database named reptilebank, so create it first if it does not exist):
```sql
DROP TABLE IF EXISTS `dbtop250`;
CREATE TABLE `dbtop250` (
  `id` int NOT NULL AUTO_INCREMENT,
  `ord` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `director` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `scriptwriters` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `star` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,
  `type` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `region` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `language` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `releaseDate` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `length` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  `synopsis` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL,
  `img` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;
```
4. Run the Scrapy project
- Press Win+R, type cmd, cd into the project's spiders directory (dbtop/dbtop/spiders), and run: scrapy crawl db
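Alternatively, if you would rather launch the crawl from PyCharm than from cmd, a small runner script placed in the project root (next to scrapy.cfg) can start the same spider; run.py is just a suggested name:

```python
# run.py -- start the db spider programmatically (a sketch)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dbtop.spiders.db import DbSpider

process = CrawlerProcess(get_project_settings())
process.crawl(DbSpider)
process.start()   # blocks until the crawl finishes
```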
5. Check the results
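If everything worked, the same data ends up in three places: book.json (DbtopPipeline), spiders/csdn.csv (ExcelPipelines), and the dbtop250 table (MysqlPipelines). A quick way to confirm the table filled up, reusing the placeholder credentials from settings.py:

```python
import pymysql

# use the same values you put into settings.py
conn = pymysql.connect(host='127.0.0.1', port=3306, user='your_mysql_user',
                       password='your_mysql_password', db='reptilebank', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('SELECT count(*) FROM dbtop250')
    print(cursor.fetchone())   # should approach 250 after a full crawl
conn.close()
```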
💕💕💕💕💕 Thanks!!!