import requests
import re
import os
from parsel import Selector
from fake_useragent import UserAgent
import csv
import time, random
import json
class WisdomBagSearch(object):
    """Scraper for chiebukuro.yahoo.co.jp category, question and answer pages.

    Pages are downloaded with ``requests`` and parsed with ``parsel`` CSS
    selectors. The CSS class names used below are copied verbatim from the
    site's markup and will need updating whenever the site changes them.
    """

    # Placeholder [login number, user name] used when a profile is not public
    # or cannot be read.
    _PRIVATE_USER = ('None', 'D非公開さん')
    # Only URLs starting with this prefix are treated as public profiles.
    _USER_URL_PREFIX = 'https://chiebukuro.yahoo.co.jp/user'
    # Seconds to wait for the server before giving up on a request.
    _REQUEST_TIMEOUT = 30
    # Link to the author's profile inside an answer item / a reply item.
    _ANSWER_HEAD_LINK = '.ClapLv2AnswerItem_Chie-AnswerItem__ItemHead__Mvlc0 a::attr(href)'
    _REPLY_HEAD_LINK = '.ClapLv2ReplyItem_Chie-ReplyItem__ItemHead__HrG5K a::attr(href)'

    def __init__(self):
        self.MainUrl = "https://chiebukuro.yahoo.co.jp"
        # Number of listing pages read per category by GetQuestionsUrls.
        self.MaxNumPage = 2

    @staticmethod
    def _selector(html):
        """Build a parsel Selector, treating a failed download (None) as an empty page."""
        return Selector(text=html or "")

    @staticmethod
    def _join_text(parts):
        """Concatenate text nodes and drop line breaks."""
        return "".join(parts).replace('\n', '').replace('\r', '')

    @staticmethod
    def _to_int(value):
        """Return the digits contained in ``value`` as an int, or 0 when there are none."""
        digits = re.sub(r'\D', '', str(value))
        return int(digits) if digits else 0

    @staticmethod
    def _parse_answer_number(page):
        """Read the answer-count text from a parsed question page (0 when absent)."""
        answerNumber = page.css(
            '.ClapLv2QuestionItem_Chie-QuestionItem__AnswerNumber__3_0RR *::text').get()
        return 0 if answerNumber is None else answerNumber

    def writeHtml(self, url, html):
        """Write ``str(html)`` to the file path ``url`` as UTF-8 (debug helper)."""
        with open(url, 'w', encoding='utf-8') as f:
            f.write(str(html))

    def GetHtml(self, Url):
        """Download ``Url`` with a random user agent.

        :param Url: address to fetch
        :return: the response body as text, or None when the request fails
        """
        header = {
            "user-agent": UserAgent().random
        }
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = requests.get(Url, headers=header, timeout=self._REQUEST_TIMEOUT)
            return response.text
        except requests.RequestException as e:
            print(e)
            return None

    def GetCategory(self, firstLayerUrl):
        """Collect the href of every category link on the category index page.

        :param firstLayerUrl: address of the first-layer (category index) page
        :return: list of href values, empty when the page could not be read
        """
        page = self._selector(self.GetHtml(firstLayerUrl))
        return page.css(
            '.ClapLv2CategoryList_Chie-CategoryList__Category2Wrapper__llQoL a::attr(href)').getall()

    def preHanle_categoryHrefS(self, categoryHrefS):
        """Turn category hrefs into directly visitable URLs.

        :param categoryHrefS: hrefs of all categories
        :return: list of absolute URLs; ``?flg=1`` selects the "solved" listing
        """
        return [self.MainUrl + categoryHref + "?flg=1" for categoryHref in categoryHrefS]

    def GetQuestionsUrls(self, categoryUrl):
        """Collect question links from the first ``MaxNumPage`` listing pages.

        :param categoryUrl: category listing address (already carrying a query string)
        :return: flat list of question hrefs found on those pages
        """
        questionUrls = []
        for pageNumber in range(1, self.MaxNumPage + 1):
            listingHtml = self.GetHtml(categoryUrl + "&page=" + str(pageNumber))
            if listingHtml is None:
                # GetHtml already reported the failure; skip this page.
                continue
            questionUrls.extend(
                self._selector(listingHtml).css(
                    '.ClapLv3List_Chie-List__ListItem__y_P8W a::attr(href)').getall())
        return questionUrls

    def AnswersNormalization(self, anotherAnswerItems):
        """Strip markup tags and line breaks from answer fragments.

        :param anotherAnswerItems: iterable of answer strings
        :return: list of cleaned strings, or None when the input is not an
                 iterable of strings
        """
        try:
            return [
                re.sub("<.*?>", '', item).replace('\n', '').replace('\r', '')
                for item in anotherAnswerItems
            ]
        except TypeError as e:
            print(type(e), e)
            return None

    def GetUserInfo(self, userInfoUrl):
        """Read login number and name from a user's profile page.

        :param userInfoUrl: profile address
        :return: [login number, user name]; the private-user placeholder when
                 the address is not a public profile URL or the profile page
                 does not expose the expected fields
        """
        if not isinstance(userInfoUrl, str) or not userInfoUrl.startswith(self._USER_URL_PREFIX):
            # Non-public IDs have no reachable profile page.
            return list(self._PRIVATE_USER)
        profile = self._selector(self.GetHtml(userInfoUrl))
        profileItems = profile.css(
            '.ClapLv2MyProfile_Chie-MyProfile__ContentItem__DfPaV *::text').getall()
        if len(profileItems) < 3:
            # Download failed or the layout changed: fall back instead of raising IndexError.
            return list(self._PRIVATE_USER)
        return [profileItems[1], profileItems[2]]

    def GetUserInfo2(self, answerHtml, headLinkCss=None):
        """Read login number and name directly from an answer/reply fragment.

        :param answerHtml: HTML of one answer or one reply
        :param headLinkCss: CSS selector for the author's profile link;
                            defaults to the answer-item head link
        :return: [login number, user name]; the login number is the last path
                 segment of the profile link, or 'None' when there is no link
                 (same placeholder GetUserInfo uses)
        """
        fragment = self._selector(answerHtml)
        answerUserUrl = fragment.css(headLinkCss or self._ANSWER_HEAD_LINK).get()
        loginnumber = os.path.basename(answerUserUrl) if answerUserUrl else self._PRIVATE_USER[0]
        # .get() returns the first match, presumably the fragment's own author
        # when replies are nested after the head -- verify against the markup.
        userName = fragment.css('.ClapLv1UserInfo_Chie-UserInfo__UserName__1bJYU *::text').get()
        return [loginnumber, userName]

    def CheckQuestionStandard(self, questionInfomation):
        """Accept every question (placeholder for a future quality filter)."""
        return True

    def GetAnswerInfo(self, answerHtml):
        """Parse one answer fragment, including its replies.

        :param answerHtml: HTML of a single answer
        :return: dict with AnswerText, AnswerTime, AnswerUserInfo,
                 AnswerApprove (0 when no counter is present) and AnswerReply
        """
        fragment = self._selector(answerHtml)
        answerText = self._join_text(
            fragment.css('.ClapLv1TextBlock_Chie-TextBlock__3X4V5 h2::text').getall())
        answerTime = fragment.css('.ClapLv1UserInfo_Chie-UserInfo__Date__2F1LF *::text').get()
        # GetUserInfo2 parses HTML, so hand it the fragment rather than the profile URL.
        answerLogin, answerUserName = self.GetUserInfo2(answerHtml)
        answerApprove = fragment.css(
            '.ClapLv1ReactionCounter_Chie-ReactionCounter__Text__1yosc *::text').get()

        replys = []  # replies attached to this answer
        for replyHtml in fragment.css(".ClapLv3ReplyList_Chie-ReplyList__Item__33upu").getall():
            # Every field is read from the reply's own fragment so that each
            # reply gets its own time and author.
            replyFragment = self._selector(replyHtml)
            replyLogin, replyUserName = self.GetUserInfo2(replyHtml, self._REPLY_HEAD_LINK)
            replys.append({
                'ReplyText': self._join_text(
                    replyFragment.css('.ClapLv1TextBlock_Chie-TextBlock__3X4V5 *::text').getall()),
                'ReplyTime': replyFragment.css(
                    '.ClapLv1UserInfo_Chie-UserInfo__DateSmall__3erUK *::text').get(),
                'ReplyUserInfo': {'Loginnumber': replyLogin, 'UserName': replyUserName},
            })

        return {
            'AnswerText': answerText,
            'AnswerTime': answerTime,
            'AnswerUserInfo': {'Loginnumber': answerLogin, 'UserName': answerUserName},
            'AnswerApprove': 0 if answerApprove is None else answerApprove,
            'AnswerReply': replys,
        }

    def GetAnswersInfo(self, answerUrl):
        """Collect the best answer and all paginated answers of a question.

        :param answerUrl: address of the question page
        :return: list of answer dicts as produced by GetAnswerInfo
        """
        answers = []
        page = self._selector(self.GetHtml(answerUrl))
        bestAnswerHtml = page.css('#ba .ClapLv3BestAnswer_Chie-BestAnswer__1kJ7F').get()
        if bestAnswerHtml is not None:
            baAnswerInfo = self.GetAnswerInfo(bestAnswerHtml)
            # NOTE(review): drops the first 7 characters, presumably a fixed
            # heading that precedes the best-answer text -- confirm.
            baAnswerInfo['AnswerText'] = baAnswerInfo['AnswerText'][7:]
            answers.append(baAnswerInfo)

        # The page is already parsed, so count answers without re-downloading it.
        answerCount = self._to_int(self._parse_answer_number(page))
        pageReply = (answerCount - 1) // 5 + 1  # answers are paged five at a time
        for pageNumber in range(1, pageReply + 1):
            answersPage = self._selector(
                self.GetHtml(answerUrl + '?sort=1&page=' + str(pageNumber)))
            for answerHtml in answersPage.css(
                    '#ans .ClapLv3AnswerList_Chie-AnswerList__Item__2PxD4').getall():
                answers.append(self.GetAnswerInfo(answerHtml))
        return answers

    def GetQuestionMessage(self, questionUrl):
        """Parse a question page into the record consumed by WriteJson.

        :param questionUrl: address of the question page
        :return: [question text, labels, question time, asker info,
                 question URL, reward, answers], or [] when the page cannot
                 be parsed
        """
        try:
            page = self._selector(self.GetHtml(questionUrl))

            authorUrl = page.css(
                '#que .ClapLv2QuestionItem_Chie-QuestionItem__Head__1ZglB a::attr(href)').get()
            authorLogin, authorName = self.GetUserInfo(authorUrl)
            authorInfo = {'Loginnumber': authorLogin, 'UserName': authorName}

            # Indexing raises IndexError when the page lacks these parts; that
            # is reported below before any further requests are made.
            questionTime = page.re('itemprop="dateCreated".*?>(.*?)</')[0]
            questionText = page.css(
                '#que .ClapLv1TextBlock_Chie-TextBlock__3X4V5 *::text').getall()[0]

            authorQuestionLabelItems = page.css(
                '#que .ClapLv2QuestionItem_Chie-QuestionItem__SubAnchor__2Pv8w *::text').getall()
            authorThankItems = page.css(
                '.ClapLv3BestAnswer_Chie-BestAnswer__Thanks__1ASeS *::text').getall()
            awardMoneyQuestionItems = page.css(
                '#que .ClapLv2QuestionItem_Chie-QuestionItem__SubChieCoin__2akxj *::text').getall()
            awardMoney = awardMoneyQuestionItems[1] if len(awardMoneyQuestionItems) == 2 else 0

            answers = self.GetAnswersInfo(questionUrl)
            if len(authorThankItems) > 4:  # the asker's thank-you note is present
                answers.append({
                    'AnswerText': authorThankItems[2].replace('\n', ''),
                    'AnswerTime': authorThankItems[4],
                    'AnswerUserInfo': authorInfo,
                    'AnswerApprove': 0,
                })

            return [
                questionText,              # question text
                authorQuestionLabelItems,  # question labels
                questionTime,              # time the question was asked
                authorInfo,                # asker account info
                questionUrl,               # question URL
                awardMoney,                # offered reward
                answers,                   # all answers
            ]
        except Exception as e:
            # Best-effort scraping: report the problem and let the caller skip
            # this question.
            print(type(e), e)
            return []

    def WriteInCsv(self, csvSaveUrl, questionInformation):
        """Append rows to a CSV file (no header row is written).

        :param csvSaveUrl: path of the CSV file
        :param questionInformation: four parallel sequences
                                    (questions, users, times, texts)
        """
        fieldnames = ['Question', 'Time', 'UserName', 'Text']
        dataList = []
        for question, user, postedAt, text in zip(*questionInformation):
            if user == '' or postedAt == '' or text == '':  # discard dirty rows
                continue
            dataList.append({
                'Question': question,
                'Time': postedAt,
                'UserName': user,
                'Text': text,
            })
        # Explicit UTF-8 so Japanese text is written the same on every platform.
        with open(csvSaveUrl, 'a+', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerows(dataList)

    def WriteJson(self, jsonUrl, questionInformation):
        """Append one question record to ``jsonUrl`` as an indented JSON object.

        Nothing is written unless ``questionInformation`` has exactly one value
        per field. Successive calls append objects back to back, so the file is
        a concatenation of JSON objects rather than a single document.

        :param jsonUrl: path of the output file
        :param questionInformation: record as returned by GetQuestionMessage
        """
        fieldnames = ['QuestionText', 'QuestionLabel', 'QuestionTime', 'QuestionerInformation', 'QuestionUrl',
                      'OfferReward', 'Answers']
        try:
            if len(fieldnames) != len(questionInformation):
                return
            messageItem = dict(zip(fieldnames, questionInformation))
            with open(jsonUrl, 'a+', encoding='utf-8') as jsonFile:
                json.dump(messageItem, jsonFile, indent=2, ensure_ascii=False)
        except (OSError, TypeError, ValueError) as e:
            # Keep the crawl going, but do not hide why a record was lost.
            print(type(e), e)

    def GetCateName(self, categoryUrl):
        """Return the category title shown on a listing page, or None when absent.

        :param categoryUrl: category listing address
        """
        page = self._selector(self.GetHtml(categoryUrl))
        return page.css('.ClapLv2Title_Chie-Title__TextWrapper__1ccaf h1::text').get()

    def GetOtherAnswerNumber(self, answerUrl):
        """Return the answer-count text of a question page.

        :param answerUrl: address of the question page
        :return: the counter text as found on the page, or 0 when it is missing
        """
        return self._parse_answer_number(self._selector(self.GetHtml(answerUrl)))
def main():
    """Crawl every category listed on the site and store its questions as JSON.

    For each category a file ``<category name>.json`` is (re)created under
    ``./Plato44`` and one JSON object per successfully parsed question is
    appended to it.
    """
    WBSearch = WisdomBagSearch()
    workPath = './Plato44'
    os.makedirs(workPath, exist_ok=True)
    firstLayerUrl = WBSearch.MainUrl + "/category"
    categoryHrefS = WBSearch.GetCategory(firstLayerUrl)  # hrefs of all categories on the index page
    secondLayerUrls = WBSearch.preHanle_categoryHrefS(categoryHrefS)  # directly visitable category URLs
    print("--启动YAHOO知惠网网络爬虫--")
    for secondLayerUrl in secondLayerUrls:
        print(secondLayerUrl)
        questionUrls = WBSearch.GetQuestionsUrls(secondLayerUrl)  # question URLs of the first listing pages
        categoryName = WBSearch.GetCateName(secondLayerUrl)
        if categoryName is None:
            # Without a title there is no file name to write to; skip instead
            # of failing on the string concatenation below.
            print('category title not found, skipped: ' + secondLayerUrl)
            continue
        print('正在爬取类别:' + categoryName)
        jsonUrl = os.path.join(workPath, categoryName + ".json")
        with open(jsonUrl, 'w', encoding='utf-8'):
            pass  # create / truncate the output file for this category
        for questionUrl in questionUrls:
            print(questionUrl)
            try:
                time.sleep(random.random())  # small random delay between requests
                questionInformation = WBSearch.GetQuestionMessage(questionUrl)
            except Exception as e:
                # Skip this question; otherwise the previous question's data
                # (or an undefined name) would be used below.
                print(type(e), e)
                continue
            print(questionInformation)
            if not questionInformation:
                # GetQuestionMessage signals a parse failure with an empty list.
                continue
            if WBSearch.CheckQuestionStandard(questionInformation):
                WBSearch.WriteJson(jsonUrl, questionInformation)
def Search(Url):
    """Crawl a single category listing and store its questions as JSON.

    The file ``<category name>.json`` is (re)created under ``./Plato43`` and
    one JSON object per successfully parsed question is appended to it.

    :param Url: category listing address (already carrying a query string)
    """
    WBSearch = WisdomBagSearch()
    workPath = './Plato43'
    secondLayerUrl = Url
    # The title is fetched once and reused for both the log line and the file name.
    categoryName = WBSearch.GetCateName(secondLayerUrl)
    print(categoryName)
    os.makedirs(workPath, exist_ok=True)
    print(secondLayerUrl)
    if categoryName is None:
        # Without a title there is no file name to write to.
        print('category title not found, skipped: ' + str(secondLayerUrl))
        return
    questionUrls = WBSearch.GetQuestionsUrls(secondLayerUrl)  # question URLs of the first listing pages
    print('正在爬取类别:' + categoryName)
    jsonUrl = os.path.join(workPath, categoryName + ".json")
    with open(jsonUrl, 'w', encoding='utf-8'):
        pass  # create / truncate the output file
    for questionUrl in questionUrls:
        print(questionUrl)
        try:
            time.sleep(random.random())  # small random delay between requests
            questionInformation = WBSearch.GetQuestionMessage(questionUrl)
        except Exception as e:
            # Skip this question; otherwise the previous question's data
            # (or an undefined name) would be used below.
            print(type(e), e)
            continue
        print(questionInformation)
        if not questionInformation:
            # GetQuestionMessage signals a parse failure with an empty list.
            continue
        if WBSearch.CheckQuestionStandard(questionInformation):
            WBSearch.WriteJson(jsonUrl, questionInformation)
# Script entry point: when run directly, crawl every category via main().
if __name__ == "__main__":
    main()
    # Listing URL for a single category; it is only used by the disabled
    # Search call below.
    Url = "https://chiebukuro.yahoo.co.jp/category/2078675272/question/list?flg=1"
    #Search(Url)