Python采集淘宝店铺所有商品API接口指南
淘宝没有面向公众开放、可直接采集任意店铺全部商品数据的官方API,但可以通过以下几种方法获取商品信息。需要注意的是,淘宝有严格的反爬机制,直接采集网页数据可能违反其服务条款。
方法一:通过淘宝客API(推荐合法方式)
淘宝客API是淘宝官方提供的合法数据接口,需要申请权限:
import hashlib
import json
import time
from urllib.parse import quote


class TaobaoShopScraper:
    """Small client for the Taobao-Ke (tbk) methods of the Taobao Open Platform.

    Every call is signed with the MD5 scheme implemented in
    ``_generate_sign`` and sent as a GET request to ``api_url``.  A valid
    ``app_key``/``app_secret`` pair with the relevant Taobao-Ke permissions
    is required for the gateway to accept the calls.
    """

    def __init__(self, app_key, app_secret, session=None, timeout=10):
        """Store the credentials and the HTTP session used for requests.

        Args:
            app_key: Application key issued by the open platform.
            app_secret: Application secret used to sign requests.
            session: Optional object exposing a ``requests.Session``-style
                ``get(url, params=..., timeout=...)`` method.  When omitted
                a new ``requests.Session`` is created.
            timeout: Seconds to wait for each HTTP request.
        """
        self.app_key = app_key
        self.app_secret = app_secret
        # NOTE(review): the platform reportedly also offers an HTTPS gateway;
        # confirm its URL in the official docs and prefer it if available.
        self.api_url = "http://gw.api.taobao.com/router/rest"
        self.timeout = timeout
        if session is None:
            # Imported here so that `requests` is only required when the
            # caller does not inject a session of their own.
            import requests

            session = requests.Session()
        self.session = session

    def _generate_sign(self, params):
        """Return the upper-case MD5 signature for ``params``.

        The signed string is ``secret + k1v1k2v2... + secret`` with keys in
        sorted order.  A pre-existing ``sign`` entry is skipped so that
        signing an already-signed dict yields the same value.
        """
        pieces = [self.app_secret]
        pieces.extend(
            f"{key}{params[key]}" for key in sorted(params) if key != "sign"
        )
        pieces.append(self.app_secret)
        return hashlib.md5("".join(pieces).encode("utf-8")).hexdigest().upper()

    def _base_params(self, method):
        """Return the common system parameters for the API ``method``."""
        # The gateway expects the timestamp in GMT+8, so derive it from UTC
        # instead of relying on the local time zone of this machine.
        gmt8_now = time.gmtime(time.time() + 8 * 3600)
        return {
            "method": method,
            "app_key": self.app_key,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", gmt8_now),
            "format": "json",
            "v": "2.0",
            "sign_method": "md5",
        }

    def _call(self, params):
        """Sign ``params``, send the request and return the decoded JSON.

        Returns ``None`` (after printing the error) when the request fails,
        the server answers with an HTTP error status, or the body is not
        valid JSON.
        """
        signed = dict(params)
        signed["sign"] = self._generate_sign(params)
        try:
            response = self.session.get(
                self.api_url, params=signed, timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()
        # requests' exceptions derive from OSError; JSON decoding errors
        # derive from ValueError.  Programming errors are not swallowed.
        except (OSError, ValueError) as e:
            print(f"API请求失败: {e}")
            return None

    def get_shop_items(self, shop_id, page_size=20, page_no=1):
        """Query ``taobao.tbk.shop.get`` for shops matching ``shop_id``.

        The requested fields are shop-level (title, URL, type, picture), so
        a second, item-level call is still needed to list a shop's products;
        this simplified example performs only the shop search step.

        Args:
            shop_id: Search text sent as ``q``; may be a shop nickname or a
                keyword.
            page_size: Number of results per page.
            page_no: 1-based page number.

        Returns:
            The decoded JSON response, or ``None`` if the request failed.
        """
        params = self._base_params("taobao.tbk.shop.get")
        params.update(
            {
                "fields": "user_id,shop_title,shop_url,shop_type,pict_url",
                "q": shop_id,
                "page_no": page_no,
                "page_size": page_size,
            }
        )
        return self._call(params)

    def get_shop_items_by_seller(self, seller_id, count=20):
        """Query ``taobao.tbk.shop.recommend.get`` for a seller ID.

        This method needs elevated API permissions.

        Args:
            seller_id: Seller identifier sent as ``user_id``.
            count: Maximum number of results requested.

        Returns:
            The decoded JSON response, or ``None`` if the request failed.
        """
        params = self._base_params("taobao.tbk.shop.recommend.get")
        params.update(
            {
                "fields": "num_iid,title,pict_url,small_images,reserve_price,zk_final_price,user_type,provcity,item_url,seller_id,volume",
                "user_id": seller_id,
                "count": count,
            }
        )
        return self._call(params)


# Usage example (replace with your own app_key and app_secret)
# scraper = TaobaoShopScraper("your_app_key", "your_app_secret")
# data = scraper.get_shop_items_by_seller("taobao_seller_id")
# print(json.dumps(data, indent=2, ensure_ascii=False))
方法二:分析网页请求获取数据(反爬风险高)
import json
import re
import time
from urllib.parse import quote


class TaobaoWebScraper:
    """Fetch shop data by imitating the requests made by the Taobao web UI.

    The endpoints and response formats used here are illustrative; the
    real site changes frequently and applies anti-scraping measures, so the
    parsing logic has to be adapted to what the browser actually receives.
    """

    def __init__(self, session=None):
        """Set up the HTTP session and the browser-like request headers.

        Args:
            session: Optional object exposing a ``requests.Session``-style
                ``get(url, headers=..., timeout=...)`` method.  When omitted
                a new ``requests.Session`` is created.
        """
        if session is None:
            # Imported here so that `requests` is only required when the
            # caller does not inject a session of their own.
            import requests

            session = requests.Session()
        self.session = session
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Referer': 'https://shopsearch.taobao.com/',
        }

    def search_shop_items(self, shop_name, page=1):
        """Search by shop name and return the page's embedded JSON config.

        Args:
            shop_name: Search text; it is URL-quoted before use.
            page: 1-based result page (20 results per page).

        Returns:
            The decoded ``g_page_config`` object found in the HTML, or
            ``None`` when it is absent, cannot be decoded, or the request
            fails.
        """
        url = f"https://shopsearch.taobao.com/search?app=shopsearch&q={quote(shop_name)}&s={(page-1)*20}"
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            # The web page is HTML; the data of interest is a JSON literal
            # assigned to `g_page_config` (may not match the current site).
            match = re.search(r'g_page_config = (.*?);\n', response.text)
            if match is None:
                return None
            return json.loads(match.group(1))
        # requests' exceptions derive from OSError; JSON decoding errors
        # derive from ValueError.
        except (OSError, ValueError) as e:
            print(f"请求失败: {e}")
            return None

    def get_shop_all_items(self, shop_id, delay=2, max_pages=None):
        """Collect a shop's items page by page until no more are returned.

        There is no single "all items" endpoint, so pages are requested one
        after another.  The URL below is an example only; the real endpoint
        has to be found with the browser's developer tools.

        Args:
            shop_id: Shop identifier interpolated into the example URL.
            delay: Seconds to pause between pages, to be polite.
            max_pages: Optional upper bound on the number of pages fetched;
                ``None`` keeps going until an empty page is seen.

        Returns:
            A list with the items gathered before the first empty page,
            the page limit, or the first failure.
        """
        all_items = []
        page = 1
        while max_pages is None or page <= max_pages:
            api_url = f"https://shop{shop_id}.taobao.com/i/asynSearch.htm?mid=w-{shop_id}-0&wid={shop_id}&path=/search.htm&search=y&pageNo={page}"
            try:
                response = self.session.get(api_url, headers=self.headers, timeout=10)
                response.raise_for_status()
                # The site signals the end of the listing with this text.
                if "没有找到" in response.text:
                    break
                # Assumes a JSON body; the real response may be an HTML
                # fragment or JSONP and need different parsing.
                data = response.json()
            except (OSError, ValueError) as e:
                print(f"获取第{page}页失败: {e}")
                break
            items = data.get('items', []) if isinstance(data, dict) else []
            if not items:
                break
            all_items.extend(items)
            page += 1
            time.sleep(delay)
        return all_items


# Usage example
# scraper = TaobaoWebScraper()
# items = scraper.get_shop_all_items("shop_id")
# print(f"Found {len(items)} items")
方法三:使用Selenium模拟浏览器(适合动态页面)
import json
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def get_shop_items_with_selenium(shop_url, max_pages=5):
    """Collect shop items by driving a headless Chrome through the listing.

    For each page the function waits for the item list to appear, reads a
    page-level data object through JavaScript, and then clicks the "next"
    link.  The CSS selectors and the ``window.__PAGE_DATA__`` global are
    placeholders that must be adapted to the real page structure.

    Args:
        shop_url: URL of the shop page to open.
        max_pages: Maximum number of listing pages to visit.

    Returns:
        A list with the items gathered before the last page, the page
        limit, or the first WebDriver error inside the paging loop.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")

    driver = webdriver.Chrome(options=chrome_options)
    all_items = []
    try:
        # Inside the try block so the browser is closed even if this fails.
        driver.set_page_load_timeout(30)
        driver.get(shop_url)

        for page in range(1, max_pages + 1):
            try:
                # Wait until at least one item of the listing is present.
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".item-list .item"))
                )

                # The script must keep its line breaks: on a single line the
                # leading `//` comments would comment out the whole body.
                items_data = driver.execute_script("""
                    // 这里需要根据淘宝实际使用的数据结构调整
                    // 可能是window.runParams或其它全局变量
                    try {
                        return window.__PAGE_DATA__ || {};
                    } catch(e) {
                        return {};
                    }
                """)

                items = items_data.get('items') if isinstance(items_data, dict) else None
                if isinstance(items, list):
                    all_items.extend(items)

                # Do not navigate past the last page that will be read.
                if page == max_pages:
                    break

                next_page = driver.find_elements(By.CSS_SELECTOR, ".pagination a.next")
                if not next_page:
                    break
                next_page[0].click()
                time.sleep(3)  # give the next page time to load
            except WebDriverException as e:
                print(f"处理第{page}页时出错: {e}")
                break

        return all_items
    finally:
        driver.quit()


# Usage example
# shop_url = "https://shop12345678.taobao.com"  # replace with the real shop URL
# items = get_shop_items_with_selenium(shop_url)
# print(json.dumps(items, indent=2, ensure_ascii=False))