Download link: https://www.pan38.com/dow/share.php?code=JCnzE Extraction code: 1158
This tool covers three main functions: 1) fetching a user's basic profile, 2) fetching the notes a user has published, and 3) fetching the comments under each note. The code implements the full crawler flow, including request handling, data parsing, and result saving, and relies on the requests, BeautifulSoup, and fake_useragent packages (see the imports below). To use it, replace the example user_ids with the actual Xiaohongshu user IDs you want to collect.
import requests
import json
import time
import random
import csv
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urlencode
class XiaohongshuSpider:
    def __init__(self):
        self.ua = UserAgent()
        self.session = requests.Session()
        self.base_url = "https://www.xiaohongshu.com"
        self.headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Origin': self.base_url,
            'Referer': f'{self.base_url}/',
            'User-Agent': self.ua.random
        }
        self.proxies = None  # configure your own proxy here if needed
    def get_user_info(self, user_id):
        """Fetch a user's basic profile information."""
        url = f"{self.base_url}/user/profile/{user_id}"
        try:
            response = self.session.get(url, headers=self.headers, proxies=self.proxies)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                script = soup.find('script', {'type': 'application/ld+json'})
                if script:
                    user_data = json.loads(script.string)
                    return {
                        'user_id': user_id,
                        'nickname': user_data.get('name', ''),
                        'description': user_data.get('description', ''),
                        'fans_count': user_data.get('interactionStatistic', {}).get('userInteractionCount', 0),
                        'follows_count': 0,  # needs to be fetched from another endpoint
                        'notes_count': 0  # needs to be fetched from another endpoint
                    }
        except Exception as e:
            print(f"Failed to fetch user info: {e}")
        return None
    def get_user_notes(self, user_id, limit=20):
        """Fetch the notes published by a user."""
        url = f"{self.base_url}/fe_api/burdens/weblog/profile/{user_id}/notes"
        params = {
            'page': 1,
            'pageSize': limit,
            'sortBy': 'time'
        }
        notes = []
        try:
            response = self.session.get(url, params=params, headers=self.headers, proxies=self.proxies)
            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    for note in data.get('data', {}).get('notes', []):
                        notes.append({
                            'note_id': note.get('id'),
                            'title': note.get('title'),
                            'desc': note.get('desc'),
                            'likes': note.get('likes'),
                            'collects': note.get('collects'),
                            'comments': note.get('comments'),
                            'time': note.get('time'),
                            'cover_url': note.get('cover', {}).get('url')
                        })
        except Exception as e:
            print(f"Failed to fetch user notes: {e}")
        return notes
    def get_note_comments(self, note_id, limit=50):
        """Fetch the comments under a note."""
        url = f"{self.base_url}/fe_api/burdens/weblog/notes/{note_id}/comments"
        params = {
            'page': 1,
            'pageSize': limit
        }
        comments = []
        try:
            response = self.session.get(url, params=params, headers=self.headers, proxies=self.proxies)
            if response.status_code == 200:
                data = response.json()
                if data.get('success'):
                    for comment in data.get('data', {}).get('comments', []):
                        comments.append({
                            'comment_id': comment.get('id'),
                            'user_id': comment.get('user', {}).get('id'),
                            'nickname': comment.get('user', {}).get('nickname'),
                            'content': comment.get('content'),
                            'likes': comment.get('likes'),
                            'time': comment.get('time'),
                            'reply_count': comment.get('replyCount')
                        })
        except Exception as e:
            print(f"Failed to fetch note comments: {e}")
        return comments
    def save_to_csv(self, data, filename):
        """Save a list of dicts to a CSV file."""
        if not data:
            return
        keys = data[0].keys()
        with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(data)
    def run(self, user_ids):
        """Main entry point."""
        all_users = []
        all_notes = []
        all_comments = []
        for user_id in user_ids:
            # Fetch the user's profile info
            user_info = self.get_user_info(user_id)
            if user_info:
                all_users.append(user_info)
                print(f"Fetched user info: {user_info['nickname']}")
            # Fetch the user's notes
            notes = self.get_user_notes(user_id)
            if notes:
                all_notes.extend(notes)
                print(f"Fetched {len(notes)} notes")
                # Fetch the comments under each note
                for note in notes:
                    comments = self.get_note_comments(note['note_id'])
                    if comments:
                        all_comments.extend(comments)
                        print(f"Fetched {len(comments)} comments for note {note['note_id']}")
                    # Random delay to reduce the risk of being blocked
                    time.sleep(random.uniform(1, 3))
        # Save the collected data
        if all_users:
            self.save_to_csv(all_users, 'users_info.csv')
        if all_notes:
            self.save_to_csv(all_notes, 'users_notes.csv')
        if all_comments:
            self.save_to_csv(all_comments, 'notes_comments.csv')
        print("Data collection finished!")
if __name__ == '__main__':
    spider = XiaohongshuSpider()
    # Example user ID list; replace with the real IDs you want to collect
    user_ids = ['5f0a1234567890abcdef', '5f0b9876543210fedcba']
    spider.run(user_ids)
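If you want to route the requests through a proxy (self.proxies defaults to None, as noted in __init__), a minimal sketch might look like the following. The proxy address and the user ID are placeholders, not values from the original post; the dict simply follows the proxies format that requests accepts.

    # Minimal usage sketch, assuming you have a working HTTP proxy available.
    spider = XiaohongshuSpider()
    spider.proxies = {
        'http': 'http://127.0.0.1:8888',   # hypothetical local proxy address
        'https': 'http://127.0.0.1:8888',
    }
    spider.run(['replace_with_real_user_id'])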