Preface

This post walks through crawling historical weather data (date, weather conditions, temperature, and wind) for every Chinese province and city from tianqihoubao.com and saving the records into a local SQLite database.
I. Where can historical weather data be obtained?
www.tianqihoubao.com
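The site organizes its archive as a simple hierarchy: homepage → province page → city page → one page per month. At the time of writing, each city's monthly history appears to live under the /lishi/ path, with URLs of the form http://tianqihoubao.com/lishi/beijing/month/201101.html (the exact city slug and month are examples). The crawler below simply walks that hierarchy level by level.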
II. Usage steps
1. Import the libraries
```python
import requests, json, re, os, sys, datetime, time
import traceback, random
import tianqiSqlite  # the author's own SQLite helper module, sketched below
from urllib.parse import urlparse
from contextlib import closing
from urllib.request import urlopen
from bs4 import BeautifulSoup
```

Several of these imports (urlparse, closing, urlopen) are not used in the snippet shown here and presumably serve helper code omitted from the post.
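tianqiSqlite is the author's own helper module and its source is not shown in this post. Below is a minimal sketch of what it might contain, assuming a local SQLite file; the file name tianqi.db and the create table statement are inferred from the insert statement used later in the crawler, not taken from the original code:

```python
# tianqiSqlite.py -- hypothetical sketch of the helper module.
import sqlite3

DB_FILE = 'tianqi.db'  # assumed file name

def _connect():
    conn = sqlite3.connect(DB_FILE)
    # Columns mirror the insert statement used by the crawler.
    conn.execute('''create table if not exists history_tianqi (
                        province text, city text, t_time text,
                        tq text, wd text, fx text)''')
    return conn

def saveInfo(sql, data):
    # Run a parameterized insert and commit.
    conn = _connect()
    try:
        conn.execute(sql, data)
        conn.commit()
    finally:
        conn.close()

def query(sql, data):
    # Run a parameterized select and return all rows.
    conn = _connect()
    try:
        return conn.execute(sql, data).fetchall()
    finally:
        conn.close()
```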
2. Writing the code

The crawler routes its requests through proxy IPs so that the site does not block repeated fetches. With requests, a proxy is just a dict mapping scheme to proxy address; a minimal sketch of the format follows (the addresses are placeholders, not real proxies):
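```python
import requests

# Placeholder proxy addresses; substitute entries from your own pool.
proxies = {
    'http': 'http://123.45.67.89:8888',
    'https': 'http://123.45.67.89:8888',
}
resp = requests.get('http://tianqihoubao.com', proxies=proxies, timeout=10)
print(resp.status_code)
```

In the class below, self.proxies starts out empty and is expected to be filled by proxy-pool code that the snippet does not show. The crawler itself walks the hierarchy described above, one province, city, and month at a time, and writes one row per day into SQLite: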
```python
class Tianqi(object):
    def __init__(self):
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        }
        self.domain = ['www.tianqihoubao.com']
        self.base_url = 'http://tianqihoubao.com'
        self.proxies = {}  # filled in by proxy-pool code not shown in this snippet

    def run(self):
        # Level 1: the homepage lists every province in a table.
        self.getHtml(self.base_url, None)
        soup_p = BeautifulSoup(self.html, 'html.parser')
        provinces = soup_p.find('table', cellpadding='1')
        if not provinces:
            return False
        provinces = provinces.select('tr td')
        p_num = 0
        for province in provinces:
            p_num += 1
            if not province.a:  # skip cells without a link
                continue
            p_name = province.a.get_text()
            print('Fetching weather for province: ' + p_name)
            p_url = self.base_url + '/' + province.a.get('href')

            # Level 2: each province page lists its cities in the same table layout.
            self.getHtml(p_url, None)
            soup_city = BeautifulSoup(self.html, 'html.parser')
            cities = soup_city.find('table', cellpadding='1')
            if not cities:
                continue
            cities = cities.select('tr td')
            c_num = 0
            for city in cities:
                c_num += 1
                # Hard-coded resume checkpoint: skip everything before
                # province 24, city 5 (adjust or remove for a fresh crawl).
                if p_num < 24 or (p_num == 24 and c_num < 5):
                    continue
                if not city.a:
                    continue
                c_name = city.a.get_text()
                print('Fetching weather for: ' + p_name + '|' + c_name)
                c_url = self.base_url + '/' + city.a.get('href')
                # The city link points at the "top" page; history lives under /lishi/.
                c_url = c_url.replace('top', 'lishi')
                print(str(p_num) + '>' + p_name + ' province: ' + str(c_num) + '>' + c_name + ' city: ' + c_url)

                # Level 3: the city history page links to one page per month.
                self.getHtml(c_url, None)
                soup_years = BeautifulSoup(self.html, 'html.parser')
                years = soup_years.find('div', id='content')
                if not years:
                    continue
                months = years.select('ul li a')
                for month in months:
                    url = month['href']
                    if not url.startswith('/'):
                        url = self.base_url + '/lishi/' + url
                    else:
                        url = self.base_url + url
                    print(url)

                    # Level 4: each monthly page holds one table row per day.
                    self.getHtml(url, None)
                    soup_months = BeautifulSoup(self.html, 'lxml')
                    days = soup_months.select('table tr')
                    for day in days:
                        tds = day.select('td')
                        # Skip the header row and any malformed rows.
                        if len(tds) < 4 or not tds[0].a:
                            continue
                        rq = tds[0].get_text().strip().replace('/', '|').replace(' ', '')  # date
                        tq = tds[1].get_text().strip().replace('/', '|').replace(' ', '')  # weather
                        wd = tds[2].get_text().strip().replace('/', '|').replace(' ', '')  # temperature
                        fx = tds[3].get_text().strip().replace('/', '|').replace(' ', '')  # wind
                        # Skip records already present in the database.
                        if self.getOne(p_name, c_name, rq) > 0:
                            print(p_name + c_name + rq + ' already exists')
                            continue
                        insertsql = ('insert into history_tianqi '
                                     '(`province`, `city`, `t_time`, `tq`, `wd`, `fx`) '
                                     'values (?,?,?,?,?,?)')
                        data = (p_name, c_name, rq, tq, wd, fx)
                        tianqiSqlite.saveInfo(insertsql, data)
```
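run() calls two helpers that the snippet does not include: getHtml, which fetches a page and stores the decoded body on self.html, and getOne, which reports how many matching rows already exist (the crawler treats any count above zero as a duplicate). Here is a minimal sketch of both, using the query helper from the tianqiSqlite sketch above; the retry loop, timeout, and encoding handling are assumptions, not the author's original code:

```python
    # These methods belong inside the Tianqi class.
    def getHtml(self, url, data):
        """Fetch url and store the decoded body on self.html (sketch)."""
        for _ in range(3):  # assumed retry count
            try:
                resp = requests.get(url, headers=self.headers,
                                    proxies=self.proxies or None, timeout=10)
                resp.encoding = resp.apparent_encoding  # the site is not served as UTF-8
                self.html = resp.text
                return
            except Exception:
                traceback.print_exc()
                time.sleep(random.uniform(1, 3))  # back off before retrying
        self.html = ''

    def getOne(self, province, city, t_time):
        """Return how many rows already exist for this province/city/date."""
        sql = ('select count(*) from history_tianqi '
               'where province=? and city=? and t_time=?')
        rows = tianqiSqlite.query(sql, (province, city, t_time))
        return rows[0][0] if rows else 0
```

With those pieces in place, the whole crawl starts from a plain entry point:

```python
if __name__ == '__main__':
    Tianqi().run()
```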