获取数据
首先要通过抓包分析网站的数据接口是什么,再构造请求头,发送请求,解析数据,最后保存数据。本项目以某宝上的一个商品的评论数据为例进行获取。
爬虫主代码:
import requests
import time
import re
import random

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10


def _write_comments(comments, out):
    """Echo each comment to stdout and write it to ``out``, one per line."""
    for item in comments:
        print(item)
        out.write(item)
        out.write('\n')


# Main crawler function
def main(i, out=None):
    """Fetch page ``i`` of the product's review list and save the review texts.

    Args:
        i: Page number, sent as the ``currentPage`` query parameter.
        out: Writable text file object that receives one review per line.
            When omitted, the reviews are appended to ``comment.txt``
            (UTF-8), so the function no longer depends on a global file
            handle created by the script entry point.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    # Build the request
    url = 'https://rate.tmall.com/list_detail_rate.htm?'
    headers = {
        # NOTE(review): this is a hard-coded login session cookie. It is kept
        # as-is to preserve behaviour, but it presumably expires and should
        # not be committed; consider loading it from configuration instead.
        'cookie': 'lid=tb482754983; enc=OLmCxk0zYIWAaWbxiJGJZLgZXFhpOoFAZRIv5/YHmby4uo3ck9KqKs9vMh3nkV/Jm+VnJn3St+k/JAYQyySRgA==; cna=QOqsGqOLgWQCAduQ6z6UauuL; hng=CN|zh-CN|CNY|156; t=1882e35fc53e83187105f94271d6bd06; tracknick=tb482754983; _tb_token_=fb3e107ee33b3; cookie2=1d6a5bb763ab9937769bf7ce6646e4a2; xlly_s=1; dnk=tb482754983; uc1=existShop=false&cookie21=VT5L2FSpczFp&cookie14=UoexNgAlbNcuGA==&cookie16=U+GCWk/74Mx5tgzv3dWpnhjPaQ==&cookie15=WqG3DMC9VAQiUQ==&pas=0; uc3=id2=Vy6xyuVKA3qrYw==&vt3=F8dCvC32qZoKEHKE5qg=&lg2=VT5L2FSpMGV7TQ==&nk2=F5RBzefNa4UWMMc=; _l_g_=Ug==; uc4=nk4=0@FY4KqBPwZ/gfx5FZc1dyxKDkL1BGNw==&id4=0@VXkWTZ2Lyk2O9F7hKk47YVusHuHA; unb=4294095874; lgc=tb482754983; cookie1=B0f1tItLZyRaW/Jg29jakLzOxwmDYfDw97vOqX1S6HQ=; login=true; cookie17=Vy6xyuVKA3qrYw==; _nk_=tb482754983; sgcookie=E100jgHkBJlgEW69L0B4WcnHonQG2ehPnMKF3v/irFogNyJmFMCO0gLU4Yqtk8A47RLN5ZkcmuKH/6NyZssrLg61tB92O4vWJlRc3G1cd9C1iUHDMK2pvF78erekZqEFUUDR; cancelledSubSites=empty; sg=34f; csg=b70d899f; l=eBMZ7wCRLrSWauuFBO5Cnurza77t3BdbzrVzaNbMiIncC6WlZlpTXrtQ0eVOoKxRR8XVMILB4ouuxKeTCFP4JyMfoTB7K9cdvdhvCe8C.; tfstk=cYlGBOOZvAy_unynFCN6MPo14xHcafvafjlETFrmcslSQEl87sD1LT5vQeq9VhWf.; isg=BBAQxDBeRg4D5xq7-fl-Ms924V5i2fQjCegEmArmQ2lmRbLvsupis8IzHQ2lk6z7',
        'referer': 'https://detail.tmall.com/item.htm?id=637890427701&price=299-499&sourceType=item&sourceType=item&suid=b849b43c-8864-4fc2-93d3-6c53552cc4b3&ut_sk=1.YndCWzm1S4EDACmWYiA%20yUls_21646297_1654485572615.TaoPassword-WeiXin.ShareGlobalNavigation_1&un=05732c75a575c0438c3bcb35c6d70f6c&share_crt_v=1&un_site=0&spm=a2159r.13376460.0.0&sp_abtk=gray_ShareGlobalNavigation_1_code_simpleAndroid&sp_tk=c0pGdTJtN0xkRmQ=&cpp=1&shareurl=true&short_name=h.fFEW3Y3&bxsign=scdtSlQKTciks4BYaWTlbr_JoHmHxyCFwvpZQBH9JQ5rcCkchEvkNKAaXmco9RLfPT4wYtX3csQoDLacARXGbaAbQdl4dAQTFBYuSB2ubf4Gj1fWxxGhGrqjolWwPVi--8ZUxwhhWnQEQTLYhDtv5rycw&tk=sJFu2m7LdFd&app=chrome',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36'
    }
    params = {
        'itemId': '637890427701',
        'spuId': '1952440250',
        'sellerId': '446338500',
        'order': '3',
        'currentPage': f'{i}',
        'append': '0',
        'content': '1',
        'tagId': '',
        'posi': '',
        'picture': '',
        'groupId': '',
        'ua': '098#E1hv2QvZv7pvjQCkvvvvvjiWRLFZQjlbPsMUgjivPmPytjDvPsdw0jDnRLFpQjA+vpvEphUkVbWvpHCNdvhvhZ38GCvOvhCjFwaYZfv1veWA39hvChCCvvmevpvhphvhHvvCvvXvovvvvvmgvpvIphvvvvvvphCvpC9vvvCCV6CvVvvvvhWFphvOvvvvpznvpC9vvvC2wg9CvvXmp99h5EAIvpvUphvhC3AF/bIUvpCWpL/gv8RJEct1B57++ul1oB61iNpfVjxhfCuYiLUpwh+Fp+0xhE3zLLEc6aZtn0vENZqOQC+7nDyiLO2v5fh3ZkZH1RvaRoxBnZJt9vgCvvLMMQvvRvhvChCvvvm+vpvEphVHHn9vph8pdvhvmZCmOCBDvhC2g4QCvvDvpDhZw9CvBoA+vpvZo6DXhE6vpHu1/z6ERfvKo9Hl6we+vpvEphUwhvGvpH1rdvhvmZCmHCmXvhCNV8QCvvDvpXwWw9CvO3JvvpvZzP1Hcu2NznswOHlftQfweaQH7e9gvpvhphvvvv==',
        'needFold': '0',
        # Millisecond timestamp with the decimal point replaced by "_"
        '_ksTS': str(time.time()*1000).replace('.', '_'),
        'callback': 'jsonp2133'
    }
    # Send the request
    resp = requests.get(url, headers=headers, params=params,
                        timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of reporting an empty page as crawled
    resp.raise_for_status()
    # Pull every review text out of the JSONP response body
    comment_list = re.findall(r'"rateContent":"(.*?)",', resp.text)
    if out is None:
        with open('comment.txt', 'a', encoding='utf-8') as fallback:
            _write_comments(comment_list, fallback)
    else:
        _write_comments(comment_list, out)
    print(f'第{i}页爬取完毕')


# Script entry point
if __name__ == '__main__':
    # Create (or append to) the text file that stores the reviews
    with open('comment.txt', 'a', encoding='utf-8') as f:
        # Crawl 10 pages of reviews
        for i in range(1, 11):
            main(i, f)
            # Random 5-15 s pause between pages to reduce the chance of
            # being flagged as a bot
            time.sleep(5 + random.random()*10)
获取的数据如下:
词云图展示
对于文本类型的数据,我们一般会进行词频统计和词云图分析。
主要步骤是:使用jieba库进行分词处理,统计词频,绘制词云图。
代码如下:
# coding=utf-8
import jieba
import collections
import re
import stylecloud
from PIL import Image
import matplotlib.pylab as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # make the minus sign render correctly

# Load the text whose word frequencies will be analysed
with open('comment.txt', encoding='utf-8') as f:
    data = f.read()

# Pre-processing: drop everything except runs of Chinese characters,
# then join those runs with "/"
new_data = "/".join(re.findall('[\u4e00-\u9fa5]+', data, re.S))

# Word segmentation (note: cut_all=True is passed to jieba)
seg_list_exact = jieba.cut(new_data, cut_all=True)

# Stop-word list, one word per line; extend the file with any words that
# should not appear in the results
with open('停用词库.txt', encoding='utf-8') as f:
    stop_words = {line.replace("\n", "") for line in f}

# Keep only words longer than one character that are not stop words
result_list = [
    word for word in seg_list_exact
    if len(word) > 1 and word not in stop_words
]
# print(result_list)

# Word-frequency statistics: the 10 most frequent words
word_counts = collections.Counter(result_list)
word_counts_top = word_counts.most_common(10)
print(word_counts_top)

# Bar chart of the top-10 words
x = [pair[0] for pair in word_counts_top]
y = [pair[1] for pair in word_counts_top]
sns.barplot(x=x, y=y)
plt.xlabel('词语')
plt.ylabel('出现的次数')
plt.title('词频统计前十名展示')
plt.show()

# Draw the word cloud
stylecloud.gen_stylecloud(
    text=' '.join(result_list[:500]),  # use the first 500 words for the picture
    collocations=False,  # whether to include two-word collocations (bigrams)
    font_path=r'C:\Windows\Fonts\msyh.ttc',  # font file; see C:\Windows\Fonts\ for the available fonts
    size=800,  # size of the stylecloud
    palette='cartocolors.qualitative.Bold_7',  # colour palette, see https://jiffyclub.github.io/palettable/
    background_color='black',  # background colour
    icon_name='fas fa-circle',  # icon used as the shape mask, see https://fontawesome.com/icons?d=gallery&p=2&c=chat,shopping,travel&m=free
    gradient='horizontal',  # gradient direction
    max_words=2000,  # maximum number of words in the stylecloud
    max_font_size=200,  # maximum font size in the stylecloud
    stopwords=True,  # boolean: filter out common stop words
    output_name='词云图.png',  # output image
)

# Open the generated image for display
img = Image.open('词云图.png')
img.show()
结果如下:
[('不错', 66), ('鞋子', 45), ('舒服', 44), ('鞋底', 35), ('价格', 32), ('质量', 29), ('穿着', 28), ('舒适', 26), ('合适', 24), ('材质', 23)]
从词云图中我们可以看出,消费者的评价主要集中在鞋子、鞋底、质量、价格、服务等方面,“舒服”“舒适”“不错”成为了主流评价。这说明消费者在线上购买这款鞋子的满足感主要来自商品的舒适度以及价格等因素,商家在制作和销售的过程中应该重点在这几个方面加强和突破。