1. Project Overview
This project implements a Python crawler for Douban movie data and stores the results in a local MySQL database.
Prepare the database schema:
create table if not exists `categories` (
  `id` int(11) NOT NULL PRIMARY KEY,
  `type` varchar(255) NOT NULL DEFAULT ''
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

create table if not exists `movies` (
  `id` int(11) NOT NULL PRIMARY KEY AUTO_INCREMENT,
  `cover` varchar(255) NOT NULL DEFAULT '',
  `title` varchar(50) NOT NULL DEFAULT '',
  `date` varchar(10) NOT NULL DEFAULT '',
  `rate` float DEFAULT 0,
  `director` varchar(100) NOT NULL DEFAULT '',
  `scriptwriter` varchar(100) NOT NULL DEFAULT '',
  `actors` text,
  `district` varchar(255) DEFAULT '',
  `language` varchar(255) DEFAULT '',
  `duration` varchar(100) DEFAULT '',
  `abs` text,
  UNIQUE (`title`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

create table if not exists `movie-category` (
  `id` BIGINT NOT NULL PRIMARY KEY AUTO_INCREMENT,
  `mid` int(11) NOT NULL,
  `cid` int(11) NOT NULL,
  KEY `fk_on_movie_id` (`mid`),
  CONSTRAINT `fk_on_movie_id` FOREIGN KEY (`mid`) REFERENCES `movies` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
  KEY `fk_on_category_id` (`cid`),
  CONSTRAINT `fk_on_category_id` FOREIGN KEY (`cid`) REFERENCES `categories` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
INSERT INTO `categories` VALUES (1,'剧情');
INSERT INTO `categories` VALUES (2,'喜剧');
INSERT INTO `categories` VALUES (3,'动作');
INSERT INTO `categories` VALUES (4,'爱情');
INSERT INTO `categories` VALUES (5,'科幻');
INSERT INTO `categories` VALUES (6,'动画');
INSERT INTO `categories` VALUES (7,'悬疑');
INSERT INTO `categories` VALUES (8,'惊悚');
INSERT INTO `categories` VALUES (9,'恐怖');
INSERT INTO `categories` VALUES (10,'犯罪');
INSERT INTO `categories` VALUES (11,'同性');
INSERT INTO `categories` VALUES (12,'音乐');
INSERT INTO `categories` VALUES (13,'歌舞');
INSERT INTO `categories` VALUES (14,'传记');
INSERT INTO `categories` VALUES (15,'历史');
INSERT INTO `categories` VALUES (16,'战争');
INSERT INTO `categories` VALUES (17,'西部');
INSERT INTO `categories` VALUES (18,'奇幻');
INSERT INTO `categories` VALUES (19,'冒险');
INSERT INTO `categories` VALUES (20,'灾难');
INSERT INTO `categories` VALUES (21,'武侠');
INSERT INTO `categories` VALUES (22,'情色');
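With these tables in place, `movies` and `categories` form a many-to-many relationship through `movie-category`. As a quick sanity check once some data has been crawled, a pymysql query can walk the join; this is just a sketch that reuses the `fivesix` database and root/root credentials from the pipeline shown later:

import pymysql

# list the ten best-rated movies tagged 剧情 (drama)
connect = pymysql.connect(host='127.0.0.1', user='root', password='root',
                          db='fivesix', charset='utf8', port=3306)
try:
    with connect.cursor() as cursor:
        cursor.execute(
            """
            SELECT m.title, m.rate
            FROM `movies` m
            JOIN `movie-category` mc ON mc.mid = m.id
            JOIN `categories` c ON c.id = mc.cid
            WHERE c.type = %s
            ORDER BY m.rate DESC
            LIMIT 10
            """, ('剧情',))
        for title, rate in cursor.fetchall():
            print(title, rate)
finally:
    connect.close()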
2. Environment
Language: Python 3.7 + Scrapy
Database: MySQL 5.7
IDE: IntelliJ IDEA or Eclipse
3. Core Code
Data model: items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # movie title
    title = scrapy.Field()
    # director
    director = scrapy.Field()
    # scriptwriter
    scriptwriter = scrapy.Field()
    # actors
    actors = scrapy.Field()
    # release date
    date = scrapy.Field()
    # rating
    rate = scrapy.Field()
    # country/region
    district = scrapy.Field()
    # language
    language = scrapy.Field()
    # cover image
    cover = scrapy.Field()
    # synopsis
    abs = scrapy.Field()
    # genres
    categories = scrapy.Field()
    # runtime
    duration = scrapy.Field()
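A scrapy.Item behaves like a dict, which is what allows the spider below to fill several fields at once with movie_item.update(final_info). A quick illustration (the sample values are made up):

from douban.items import DoubanItem

item = DoubanItem()
item['title'] = '霸王别姬'                      # dict-style access on declared fields
item.update({'rate': '9.6', 'duration': '171分钟'})
print(dict(item))   # {'title': '霸王别姬', 'rate': '9.6', 'duration': '171分钟'}

Assigning to a field that was not declared with scrapy.Field() raises a KeyError, which keeps typos from silently creating new fields.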
Pipeline definitions: pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline
import pymysql


class DoubanPipeline:
    def process_item(self, item, spider):
        return item


# Re-request the scraped cover URL and download the image to local disk
class DownloadImagePipeline(ImagesPipeline):
    default_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        #"Cookie":'_vwo_uuid_v2=D65EBF690D9454DE4C13354E37DC5B9AA|3bb7e6e65f20e31141b871b4fea88dc2; __yadk_uid=QBp8bLKHjCn5zS2J5r8xV7327R0wnqkU; douban-fav-remind=1; gr_user_id=0a41d8d1-fe39-4619-827a-17961cf31795; viewed="35013197_10769749_23008813_26282806_34912177_22139960_35003794_30249691_26616244_27035127"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.21320; bid=gplG4aEN4Xc; ll="108288"; ap_v=0,6.0; __utma=30149280.819011260.1572087992.1604448803.1604453561.105; __utmc=30149280; __utmz=30149280.1604453561.105.65.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=eddb65558a1da756-223ab4f88bc400c8:T=1604453562:RT=1604453562:S=ALNI_MZGB_I69qmiL2tt3lm57JVX1i4r2w; __utmb=30149280.4.10.1604453561; dbcl2="213202515:Ip9mjwUAab4"; ck=wxUS; __utma=223695111.897479705.1572088003.1604448803.1604455298.71; __utmb=223695111.0.10.1604455298; __utmc=223695111; __utmz=223695111.1604455298.71.42.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1604455298%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=e11874c5506d4ab1.1572088003.71.1604455342.1604450364.'
    }

    def get_media_requests(self, item, info):
        image_url = item['cover']
        yield Request(image_url, headers=self.default_headers)

    # runs after every request from get_media_requests has completed
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # The returned path has the form full/<filename>. Because images are
        # downloaded one movie at a time, the list holds a single entry even
        # though the API is designed for several. Replace 'full' with the
        # backend file endpoint so the database stores a servable URL.
        image_paths = str(image_paths[0]).replace('full', 'http://localhost:8443/api/file')
        item['cover'] = image_paths
        return item


# Write movie records to the database
class DBPipeline(object):
    def __init__(self):
        # connect to the database: host, user, password, database name
        self.connect = pymysql.connect(host='127.0.0.1', user='root', password='root',
                                       db='fivesix', charset='utf8', port=3306)
        # get cursors
        self.cursor_1 = self.connect.cursor()
        self.cursor_2 = self.connect.cursor()
        self.type_to_id = {
            '剧情': 1, '喜剧': 2, '动作': 3, '爱情': 4, '科幻': 5, '动画': 6,
            '悬疑': 7, '惊悚': 8, '恐怖': 9, '犯罪': 10, '同性': 11, '音乐': 12,
            '歌舞': 13, '传记': 14, '历史': 15, '战争': 16, '西部': 17, '奇幻': 18,
            '冒险': 19, '灾难': 20, '武侠': 21, '情色': 22
        }
        print("Connected to the database")

    def process_item(self, item, spider):
        if item['title'] == '':
            return item

        # SQL statements
        insert_movie_sql = """
            insert ignore into `movies`
                (cover, title, director, scriptwriter, actors, district,
                 rate, `date`, `language`, duration, abs)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        insert_mc_sql = """
            insert into `movie-category` (mid, cid) values (%s, %s)
        """

        # insert the movie record
        self.cursor_1.execute(insert_movie_sql,
                              (item['cover'], item['title'], item['director'],
                               item['scriptwriter'], item['actors'], item['district'],
                               item['rate'], item['date'], item['language'],
                               item['duration'], item['abs']))
        # `insert ignore` skips rows whose title already exists; lastrowid is
        # then not a valid movie id, so skip the category links as well
        if self.cursor_1.rowcount == 0:
            return item
        mid = self.cursor_1.lastrowid

        # map genre names to category ids
        cids = []
        categories = item['categories'].split('/')
        for c in categories:
            if c not in self.type_to_id:
                continue
            cids.append(self.type_to_id[c])

        # fill the join table
        print(cids)
        for cid in cids:
            self.cursor_2.execute(insert_mc_sql, (mid, cid))

        # commit, otherwise nothing is persisted
        self.connect.commit()
        return item

    def close_spider(self, spider):
        # close the cursors and the connection
        self.cursor_1.close()
        self.cursor_2.close()
        self.connect.close()
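None of these pipelines run unless they are registered in settings.py, as the scaffold comment at the top of the file notes. A minimal sketch of the relevant settings; the module path douban.pipelines and the IMAGES_STORE directory are assumptions to adapt to your own project layout:

# settings.py (excerpt)
ITEM_PIPELINES = {
    # lower numbers run earlier: download the cover first, then write to MySQL
    'douban.pipelines.DownloadImagePipeline': 300,
    'douban.pipelines.DBPipeline': 400,
}

# ImagesPipeline subclasses require a storage directory for downloaded files
IMAGES_STORE = './images'

# throttle politely instead of time.sleep; see the summary on IP bans
DOWNLOAD_DELAY = 2

Scrapy's ImagesPipeline saves files under a full/ subdirectory of IMAGES_STORE, which is why item_completed above rewrites the full prefix into a backend URL.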
Spider core code: movies.py
# -*- coding: utf-8 -*-
import scrapy
import json
import re
import time
from douban.items import DoubanItem


class MovieHotSpider(scrapy.Spider):
    # spider name, used to run it from the command line
    name = "movie_hot"
    allowed_domains = ["movie.douban.com"]
    #pro = ['139.224.37.83', '115.223.7.110', '221.122.91.75']

    # template for Douban's movie list API
    BASE_URL = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%s&sort=recommend&page_limit=%s&page_start=%s'
    MOVIE_TAG = '华语'
    PAGE_LIMIT = 20
    page_start = 0

    domains = BASE_URL % (MOVIE_TAG, PAGE_LIMIT, page_start)

    # pose as a regular browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
        #,"Cookie":'_vwo_uuid_v2=D65EBF690D9454DE4C13354E37DC5B9AA|3bb7e6e65f20e31141b871b4fea88dc2; __yadk_uid=QBp8bLKHjCn5zS2J5r8xV7327R0wnqkU; douban-fav-remind=1; gr_user_id=0a41d8d1-fe39-4619-827a-17961cf31795; viewed="35013197_10769749_23008813_26282806_34912177_22139960_35003794_30249691_26616244_27035127"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.21320; bid=gplG4aEN4Xc; ll="108288"; ap_v=0,6.0; __utma=30149280.819011260.1572087992.1604448803.1604453561.105; __utmc=30149280; __utmz=30149280.1604453561.105.65.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=eddb65558a1da756-223ab4f88bc400c8:T=1604453562:RT=1604453562:S=ALNI_MZGB_I69qmiL2tt3lm57JVX1i4r2w; __utmb=30149280.4.10.1604453561; dbcl2="213202515:Ip9mjwUAab4"; ck=wxUS; __utma=223695111.897479705.1572088003.1604448803.1604455298.71; __utmb=223695111.0.10.1604455298; __utmc=223695111; __utmz=223695111.1604455298.71.42.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1604455298%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=e11874c5506d4ab1.1572088003.71.1604455342.1604450364.'
    }

    # total number of list pages to crawl
    pages = 100

    # the crawl starts here
    def start_requests(self):
        print('~~~~ crawling list: ' + self.domains)
        yield scrapy.Request(
            url=self.domains,
            headers=self.headers,
            callback=self.request_movies
        )

    # parse a list page
    def request_movies(self, response):
        infos = response.text
        # parse the JSON response
        infos = json.loads(infos)
        # iterate over the movie entries
        for movie_info in infos['subjects']:
            print('~~~ crawling movie: ' + movie_info['title'] + '/' + movie_info['rate'])
            # request each movie's detail page
            yield scrapy.Request(
                url=str(movie_info['url']),
                headers=self.headers,
                callback=self.request_movie,
                dont_filter=True
            )

        # stop once `pages` pages are done or the current tag has no more movies
        if self.pages > 0 and len(infos['subjects']) == self.PAGE_LIMIT:
            self.pages -= 1
            self.page_start += self.PAGE_LIMIT
            url = self.BASE_URL % (self.MOVIE_TAG, self.PAGE_LIMIT, self.page_start)
            # note: time.sleep blocks the whole crawler; Scrapy's
            # DOWNLOAD_DELAY setting is the non-blocking way to throttle
            time.sleep(5)
            print('----- crawling list: ' + url)
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                callback=self.request_movies,
                dont_filter=True
            )

    # parse a detail page
    def request_movie(self, response):
        # assemble the item
        movie_item = DoubanItem()

        title = response.css('div#content>h1>span:nth-child(1)::text').extract_first()
        # keep only CJK characters, CJK punctuation, digits and underscores;
        # any Latin part of the title is dropped
        t = re.findall('[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5_0-9]', title)

        # fields that live outside the #info block
        movie_item['title'] = ''.join(t)
        movie_item['date'] = response.css('div#content>h1>span.year::text').extract_first()[1:-1]
        movie_item['rate'] = response.css('strong.rating_num::text').extract_first()
        #movie_item['commentCount'] = response.css('div.rating_sum>a.rating_people>span::text').extract_first()
        #movie_item['start'] = '/'.join(response.css('span.rating_per::text').extract())
        #movie_item['better'] = '/'.join(response.css('div.rating_betterthan>a::text').extract())
        movie_item['abs'] = (response.css('#link-report>span::text').extract_first() or '').strip()
        movie_item['cover'] = response.css('#mainpic>a>img::attr(src)').extract_first()

        # grab the whole #info block as one string
        info = response.css('div.subject div#info').xpath('string(.)').extract_first()
        # extract the field labels
        fields = [s.strip().replace(':', '') for s in response.css('div#info span.pl::text').extract()]
        # extract the field values by splitting the block on the labels
        values = [re.sub(r'\s+', '', s.strip()) for s in re.split(r'\s*(?:%s):\s*' % '|'.join(fields), info)][1:]

        # map the Chinese labels to item field names
        for i in range(len(fields)):
            if '导演' == fields[i]: fields[i] = 'director'
            if '编剧' == fields[i]: fields[i] = 'scriptwriter'
            if '主演' == fields[i]: fields[i] = 'actors'
            if '类型' == fields[i]: fields[i] = 'categories'
            if '制片国家/地区' == fields[i]: fields[i] = 'district'
            if '语言' == fields[i]: fields[i] = 'language'
            if '片长' == fields[i]: fields[i] = 'duration'

        # drop fields we do not store; filter with a comprehension, since
        # calling remove() on a list while iterating over it skips elements
        skip = ['IMDb链接', '上映日期', '官方网站', '又名']
        other_info = [(field, value) for field, value in zip(fields, values)
                      if field not in skip]
        # discard the trailing pair (the tail of the info block)
        final_info = dict(other_info[:-1])
        movie_item.update(final_info)

        # fall back to '/' for any field missing on the page
        for field in ('director', 'scriptwriter', 'actors', 'categories',
                      'district', 'language', 'duration'):
            if field not in movie_item:
                movie_item[field] = '/'

        print('~ finished movie: ' + movie_item['title'] + '/' + movie_item['rate'])
        yield movie_item
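The least obvious step in request_movie is recovering field values from the flattened #info text: the labels themselves serve as split points. The same logic, run standalone on a made-up sample (the sample string only imitates what div#info flattens to, it is not real Douban markup):

import re

info = '导演: 张艺谋 编剧: 李冯 主演: 李连杰 / 梁朝伟 类型: 剧情 / 动作 片长: 99分钟'
fields = ['导演', '编剧', '主演', '类型', '片长']

# split on any label, drop the empty piece before the first label,
# then strip all whitespace inside each value
values = [re.sub(r'\s+', '', s.strip())
          for s in re.split(r'\s*(?:%s):\s*' % '|'.join(fields), info)][1:]
print(list(zip(fields, values)))
# [('导演', '张艺谋'), ('编剧', '李冯'), ('主演', '李连杰/梁朝伟'),
#  ('类型', '剧情/动作'), ('片长', '99分钟')]

With everything in place, the spider runs from the project root with: scrapy crawl movie_hot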
4. Summary
The crawled data ends up in the MySQL tables, and a separate program can be written to display it. Note that Douban throttles by IP: around 200 requests is the limit, beyond which the IP gets blocked, so switch to a different IP to keep crawling. The project focuses on the basic conventions and syntax of writing a crawler; it is fairly simple and is offered for study and reference.
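Since the IP ban is the main practical obstacle, one common mitigation, sketched here rather than taken from this project's code, is a downloader middleware that rotates each request through a proxy pool. The PROXIES entries below are placeholders (they echo the commented-out pro list in the spider) and must be replaced with proxies you actually control:

# middlewares.py -- a minimal rotating-proxy sketch
import random

PROXIES = [
    'http://139.224.37.83:8080',   # hypothetical addresses, substitute your own
    'http://115.223.7.110:8080',
    'http://221.122.91.75:8080',
]

class RandomProxyMiddleware:
    def process_request(self, request, spider):
        # Scrapy routes the request through whatever the `proxy` meta key holds
        request.meta['proxy'] = random.choice(PROXIES)

Enable it in settings.py with DOWNLOADER_MIDDLEWARES = {'douban.middlewares.RandomProxyMiddleware': 543}.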