This article uses the Xi'an Medical University president's mailbox (西安医学院-校长信箱) page as the basis for a web-scraping case study.
Below is the data we want to scrape. Note that the list is paginated, so the crawler has to walk through multiple pages; a minimal sketch of how the per-page URLs are built follows.
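Since the list is paginated, each page is reached through the PAGENUM query parameter. A minimal sketch of building the per-page URL (the parameter names are taken directly from the full teaching code further down):

import requests

base_url = "https://www.xiyi.edu.cn/gzcylist.jsp"

def page_url(page_num):
    # Same query string as the teaching code below; only PAGENUM changes from page to page.
    return f"{base_url}?totalpage=101&PAGENUM={page_num}&urltype=tree.TreeTempUrl&wbtreeid=1172"

print(page_url(1))  # URL of the first list page
print(page_url(2))  # URL of the second list page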
Screenshot of the script running:
The key step is analyzing the page's elements and then extracting the corresponding values; a small sketch of that step is shown below.
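As a quick illustration of that analysis step, here is a minimal sketch that fetches one list page and prints the text of every table cell. Like the full teaching code, it assumes the list sits in a table inside a div with class "content"; if the page layout differs, the selectors need adjusting:

import requests
from bs4 import BeautifulSoup

url = "https://www.xiyi.edu.cn/gzcylist.jsp?totalpage=101&PAGENUM=1&urltype=tree.TreeTempUrl&wbtreeid=1172"
headers = {'User-Agent': 'Mozilla/5.0'}

soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
table = soup.find('div', class_='content').find('table')  # same selectors as the teaching code below
for row in table.find_all('tr')[1:]:  # skip the header row
    print([td.text.strip() for td in row.find_all('td')])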
Here is the scraped data saved to Excel:
Teaching code:
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
import time

# A browser-like User-Agent so the site does not reject the requests.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def parse_page(url, ws):
    # Fetch one list page and append every data row to the worksheet.
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('div', class_='content').find('table')
    rows = table.find_all('tr')[1:]  # the first row is the table header, skip it
    for row in rows:
        cols = row.find_all('td')
        if len(cols) < 5:  # make sure the row has the expected number of columns
            continue
        query_code = cols[1].text.strip()
        title = cols[2].text.strip()
        submit_time = cols[3].text.strip()
        process_status = cols[4].text.strip()
        ws.append([query_code, title, submit_time, process_status])
        print("查询码:", query_code)
        print("标题:", title)
        print("提交时间:", submit_time)
        print("处理状态:", process_status)
        print("-" * 50)

def main():
    wb = Workbook()
    ws = wb.active
    ws.append(["查询码", "标题", "提交时间", "处理状态"])
    base_url = "https://www.xiyi.edu.cn/gzcylist.jsp"
    page_num = 1
    while True:
        url = f"{base_url}?totalpage=101&PAGENUM={page_num}&urltype=tree.TreeTempUrl&wbtreeid=1172"
        print(f"正在爬取第 {page_num} 页...")
        parse_page(url, ws)
        wb.save("crawl_data.xlsx")  # save after every page so partial results are not lost
        time.sleep(2)  # pause for 2 seconds to avoid hitting the server too hard
        # Check whether a "next page" control exists; if it does not, this is the last page.
        response = requests.get(url, headers=HEADERS)
        soup = BeautifulSoup(response.text, 'html.parser')
        next_button = soup.find('span', class_='p_next')
        if next_button is None:
            break
        page_num += 1
    print("数据已保存到 crawl_data.xlsx 文件中。")

if __name__ == "__main__":
    main()