Download: https://www.pan38.com/share.php?code=JCnzE (extraction code: 8888)
The project consists of three main modules: the crawler core, data processing, and proxy management. To run it you need requests, beautifulsoup4, pandas, and related libraries (the code below also imports fake_useragent and jieba). Please respect the target site's robots.txt and terms of service.
import requests
from bs4 import BeautifulSoup
import json
import time
import random
from fake_useragent import UserAgent
class XiaoHongShuSpider:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.base_url = "https://www.xiaohongshu.com"
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive'
        }

    def get_random_header(self):
        # Combine the static headers with a freshly randomized User-Agent
        headers = dict(self.headers)
        headers.update({
            'User-Agent': self.ua.random,
            'Referer': self.base_url
        })
        return headers
    def get_note_detail(self, note_id):
        url = f"{self.base_url}/explore/{note_id}"
        try:
            response = self.session.get(url, headers=self.get_random_header())
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Parse the page content
                title_tag = soup.find('h1')
                title = title_tag.text if title_tag else ''
                content_tag = soup.find('div', class_='content')
                content = content_tag.text if content_tag else ''
                return {
                    'title': title,
                    'content': content,
                    'note_id': note_id
                }
            return None
        except Exception as e:
            print(f"Error fetching note {note_id}: {str(e)}")
            return None
    def get_live_comments(self, live_id):
        api_url = f"{self.base_url}/api/live/{live_id}/comments"
        try:
            response = self.session.get(api_url, headers=self.get_random_header())
            if response.status_code == 200:
                return response.json()
            return None
        except Exception as e:
            print(f"Error fetching live comments {live_id}: {str(e)}")
            return None
    def search_keyword(self, keyword, page=1):
        params = {
            'keyword': keyword,
            'page': page
        }
        try:
            response = self.session.get(
                f"{self.base_url}/search_api/v1/search",
                params=params,
                headers=self.get_random_header()
            )
            if response.status_code == 200:
                return response.json()
            return None
        except Exception as e:
            print(f"Error searching {keyword}: {str(e)}")
            return None
if __name__ == "__main__":
    spider = XiaoHongShuSpider()
    # Example usage
    note_data = spider.get_note_detail("123456789")
    print(note_data)
    time.sleep(random.uniform(1, 3))
    comments = spider.get_live_comments("987654321")
    print(comments)
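The demo above never calls search_keyword, so here is a rough continuation of the same example that pages through a few result pages with a polite delay. The keyword and page count are placeholders, and the structure of the returned JSON depends on what the endpoint actually sends back.
    # Hypothetical continuation of the example above: page through search results.
    search_pages = []
    for page in range(1, 4):  # placeholder page range
        result = spider.search_keyword("口红", page=page)  # placeholder keyword
        if result:
            search_pages.append(result)
        time.sleep(random.uniform(1, 3))  # polite delay between requests
    print(f"Fetched {len(search_pages)} pages of search results")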
import pandas as pd
from datetime import datetime
class DataProcessor:
    @staticmethod
    def save_to_csv(data, filename):
        df = pd.DataFrame(data)
        df['crawl_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df.to_csv(filename, index=False, encoding='utf_8_sig')
        print(f"Data saved to {filename}")

    @staticmethod
    def filter_keyword_comments(comments, keywords):
        if not comments or not isinstance(comments, list):
            return []
        filtered = []
        for comment in comments:
            if any(keyword.lower() in comment.get('content', '').lower()
                   for keyword in keywords):
                filtered.append(comment)
        return filtered
    @staticmethod
    def analyze_user_behavior(notes_data):
        if not notes_data:
            return {}
        df = pd.DataFrame(notes_data)
        analysis = {
            'total_notes': len(df),
            'avg_title_length': df['title'].str.len().mean(),
            'avg_content_length': df['content'].str.len().mean(),
            'top_keywords': DataProcessor._extract_keywords(df['content'].str.cat(sep=' '))
        }
        return analysis

    @staticmethod
    def _extract_keywords(text, top_n=10):
        # Simple keyword extraction: segment with jieba and count word frequency
        from collections import Counter
        import jieba
        words = jieba.cut(text)
        return Counter(words).most_common(top_n)
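A rough usage sketch tying this module to the crawler, assuming XiaoHongShuSpider from the crawler module above is importable in the same project; the note IDs and output filename are placeholders, not real data.
if __name__ == "__main__":
    import time
    import random
    spider = XiaoHongShuSpider()  # assumes the crawler module above is on the path
    notes = []
    for note_id in ["123456789", "987654321"]:  # placeholder note IDs
        detail = spider.get_note_detail(note_id)
        if detail:
            notes.append(detail)
        time.sleep(random.uniform(1, 3))
    DataProcessor.save_to_csv(notes, "notes.csv")  # hypothetical output file
    print(DataProcessor.analyze_user_behavior(notes))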
import requests
from threading import Lock
class ProxyManager:
    def __init__(self, api_url=None):
        self.proxies = []
        self.current_index = 0
        self.lock = Lock()
        self.api_url = api_url

    def load_proxies(self, file_path=None):
        # Load proxies either from a local file (one per line) or from an API endpoint
        if file_path:
            with open(file_path, 'r') as f:
                self.proxies = [line.strip() for line in f if line.strip()]
        elif self.api_url:
            try:
                response = requests.get(self.api_url)
                if response.status_code == 200:
                    self.proxies = response.json().get('data', [])
            except Exception as e:
                print(f"Error loading proxies from API: {str(e)}")
    def get_proxy(self):
        if not self.proxies:
            return None
        with self.lock:
            # Round-robin rotation, kept thread-safe by the lock
            proxy = self.proxies[self.current_index]
            self.current_index = (self.current_index + 1) % len(self.proxies)
        return {
            'http': f'http://{proxy}',
            'https': f'http://{proxy}'
        }
    def check_proxy_available(self, proxy):
        # Quick liveness check against a fast, stable page
        try:
            test_url = "http://www.baidu.com"
            response = requests.get(
                test_url,
                proxies=proxy,
                timeout=5
            )
            return response.status_code == 200
        except requests.RequestException:
            return False
    def validate_all_proxies(self):
        available_proxies = []
        for proxy in self.proxies:
            formatted = {
                'http': f'http://{proxy}',
                'https': f'http://{proxy}'
            }
            if self.check_proxy_available(formatted):
                available_proxies.append(proxy)
        self.proxies = available_proxies
        return len(available_proxies)
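A minimal sketch of wiring ProxyManager into the crawler. proxies.txt is a hypothetical file with one ip:port entry per line, and XiaoHongShuSpider is again assumed to come from the crawler module above; nothing here is verified against a real proxy source.
if __name__ == "__main__":
    manager = ProxyManager()
    manager.load_proxies(file_path="proxies.txt")  # hypothetical proxy list, one ip:port per line
    print(f"{manager.validate_all_proxies()} proxies are reachable")
    proxy = manager.get_proxy()
    if proxy:
        spider = XiaoHongShuSpider()  # assumes the crawler module above is importable
        spider.session.proxies.update(proxy)  # route all of the spider's requests through this proxy
        print(spider.get_note_detail("123456789"))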