京东图书评论包含非常丰富的信息,例如购买日期、书名、作者、好评、中评、差评等等。下面以购买日期为例,使用 Python + MySQL 搭配实现抓取,程序不大,只有约100行。相关的解释我都以注释的形式写在程序里了:
import re
import sys
import threading
import time

import MySQLdb
import win32com.client
from bs4 import BeautifulSoup
from selenium import webdriver

def mydebug():
    """Debugging aid: shut the browser down and stop the script.

    Quits the module-level ``driver`` and then calls ``exit(0)``.
    """
    driver.quit()
    exit(0)

def catchDate(s):
    """Extract the purchase-date fragments from one review page.

    Every ``<div class="date-buy">`` element in the page is inspected and
    the ``contents`` list of its first ``<br>`` tag is collected.

    Args:
        s: HTML source of the page.

    Returns:
        A list with one entry per usable ``date-buy`` element; each entry
        is the non-empty ``contents`` list of that element's ``<br>`` tag.
        Elements without a ``<br>`` tag, or whose ``<br>`` has no contents,
        are skipped.

    Side effects:
        Increments the module-level ``nowtimes`` counter once per collected
        entry.
    """
    global nowtimes

    # No parser is named here, so bs4 uses whichever one it selects itself.
    soup = BeautifulSoup(s)
    dates = []
    for block in soup.findAll("div", class_="date-buy"):
        br = block.find('br')
        if br is None:
            # No <br> inside this element: nothing to extract.
            continue
        pieces = br.contents
        # ``contents`` is a list, so the original ``tmp != ""`` test was
        # always true and let empty lists through; callers index [0].
        if not pieces:
            continue
        dates.append(pieces)
        # NOTE(review): the original listing lost its indentation, so it is
        # not certain whether the counter was meant to advance per entry
        # (as here) or once per page -- confirm against totaltimes.
        nowtimes += 1
    return dates

def getTimes(n, t):
    """Build the progress message for ``n`` finished steps out of ``t``.

    The percentage is ``100 * n / t`` truncated to an integer and embedded
    in the fixed progress text.
    """
    percent = int(100 * n / t)
    return "".join(["当前进度为:", str(percent), "%"])


# ---------------------------------| program start |---------------------------------
# Top-level book categories (category id -> category name).
cate = {
    "3273": "历史",
    "3279": "心理学",
    "3276": "政治军事",
    "3275": "国学古籍",
    "3274": "哲学宗教",
    "3277": "法律",
    "3280": "文化",
    "3281": "社会科学",
}

# Resume point: the book id to restart at and the page offset within that
# book.  The answers are read as plain text and converted explicitly,
# because Python 2's input() evaluates whatever is typed as an expression.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3
num1 = _read_line("bookid:").strip()
num2 = int(_read_line("pagenumber:"))

# Denominator and running counter for the progress display.  The original
# comment gives the total as 17355*20 = 347100; nowtimes is advanced by
# catchDate().
totaltimes = 347100.0
nowtimes = 0

# Start the browser that renders the review pages.  The raw string keeps the
# backslashes of the Windows path literal (same value as before).
#driver = webdriver.PhantomJS()
driver = webdriver.Ie(r'C:\Python27\Scripts\IEDriverServer')
#driver = webdriver.Chrome('C:\Python27\Scripts\chromedriver')

# Connect to the database that lists the review pages to fetch.
try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='jd')
except Exception as e:
    print(e)
    # Do not leave the browser running when the script cannot continue.
    driver.quit()
    sys.exit(1)

# Parameterised statement: the driver quotes the values, so scraped text can
# never break or alter the SQL.
insert_sql = "INSERT INTO ljj (id, cateid, bookid, date) VALUES (NULL, %s, %s, %s)"

# Becomes 1 once the resume book has been reached in the result set.
flag = 0

try:
    cursor = conn.cursor()
    try:
        # NOTE(review): resuming relies on the row order; rows with equal
        # pagenumber have no guaranteed relative order -- confirm that this
        # is acceptable or add a tie-breaking sort column.
        sql = "SELECT * FROM booknew ORDER BY pagenumber DESC"
        cursor.execute(sql)
        alldata = cursor.fetchall()

        # Example page: http://club.jd.com/review/10178500-1-154.html
        for rec in alldata:
            # rec[0] -- bookid, rec[1] -- cateid, rec[2] -- pagenumber
            if flag == 0 and rec[0] != num1:
                # Still before the resume book: skip.
                continue
            flag = 1

            # Only the resume book starts part-way through; every later
            # book starts at its first page.  Resetting here (not inside
            # the page loop) also covers a resume book with no pages left.
            start_page = num2
            num2 = 0

            # Page numbers in the URL are 1-based.
            for p in range(start_page + 1, int(rec[2]) + 1):
                link = "http://club.jd.com/review/" + rec[0] + "-1-" + str(p) + ".html"
                # Fetch the page.
                driver.get(link)
                html = driver.page_source
                # Extract the purchase dates.
                buydate = catchDate(html)
                # Store them.
                for z in buydate:
                    if not z:
                        # Nothing extracted for this entry; z[0] would fail.
                        continue
                    try:
                        # The original statement wrote rec[0] (bookid) into
                        # cateid and rec[1] (cateid) into bookid; the values
                        # now match the column list.
                        cursor.execute(insert_sql, (rec[1], rec[0], z[0]))
                    except Exception as e:
                        # Best effort: report the failed row and carry on.
                        print(e)
                conn.commit()
                print(getTimes(nowtimes, totaltimes))
    finally:
        cursor.close()
finally:
    # Always release the connection and the browser, even after an error.
    conn.close()
    driver.quit()