概述
使用 Python 爬取手机号码的前 7 位(号段)及其对应的区号和归属地区。
小网站不容易,对爬虫也挺友好,就不放链接了。
代码
import time


def parse_page(url, header, cookie, timeout=10):
    """Fetch one listing page and append its rows to ``phonenum.txt``.

    Each output line has the form ``prefix,area_code,region``. Rows with
    class ``even`` are written first, followed by rows with class ``odd``.

    Args:
        url: Address of the page to download.
        header: Mapping of HTTP request headers.
        cookie: Mapping of cookie names to values sent with the request.
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole crawl.
    """
    # Third-party packages are imported where they are used so that the
    # pure helpers in this script stay importable without them installed.
    import requests
    from lxml import etree

    resp = requests.get(url, headers=header, cookies=cookie, timeout=timeout)
    html = etree.HTML(resp.text)
    filename = "phonenum.txt"

    lines = []
    for row_class in ("even", "odd"):
        row = f"//tr[@class='{row_class}']"
        # Column 1 holds the phone prefix (link text), column 2 the region
        # and column 4 the area code.
        prefixes = html.xpath(f"{row}/td[1]/a/text()")
        regions = html.xpath(f"{row}/td[2]/text()")
        area_codes = html.xpath(f"{row}/td[4]/text()")
        # NOTE: zip() stops at the shortest list, and a row with an empty
        # cell would shift the values of the rows after it.
        lines.extend(
            f"{prefix},{area_code},{region}\n"
            for prefix, area_code, region in zip(prefixes, area_codes, regions)
        )

    # Open the file once per page instead of once per row; skip the open
    # entirely when the page produced nothing.
    if lines:
        with open(filename, "a", encoding="utf-8") as f_obj:
            f_obj.writelines(lines)


def cookie_to_dict(cookie_src):
    """Convert a ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;`` (surrounding whitespace is ignored). Only
    the first ``=`` of a pair separates the name from the value, so values
    that themselves contain ``=`` are kept intact. Empty segments and
    segments without ``=`` are skipped, which makes an empty input string
    yield an empty dict.

    Args:
        cookie_src: Raw cookie string, e.g. ``"a=1; b=2"``.

    Returns:
        Dict mapping each cookie name to its value.
    """
    cookie_dict = {}
    for pair in cookie_src.split(";"):
        name, sep, value = pair.strip().partition("=")
        if not sep:
            # Empty segment or no "=": nothing to store.
            continue
        cookie_dict[name] = value
    return cookie_dict


def main():
    """Crawl every listing page in order and save the rows to disk."""
    # Imported here for the same reason as in parse_page().
    from fake_useragent import UserAgent

    ua = UserAgent()
    # Fill in your own cookie string here.
    cookie_src = ""
    # The cookie never changes between pages, so parse it once.
    cookie = cookie_to_dict(cookie_src)

    # Page addresses are sequential, so a plain range is enough.
    for page in range(1, 1234):
        # Site address (placeholder).
        phone_url = "...........%d" % page
        # Use a fresh random User-Agent for every request.
        phone_headers = {"User-Agent": ua.random}
        parse_page(phone_url, phone_headers, cookie)
        print(f"page: {page}")
        # Pause 2 seconds between pages to avoid overloading the site.
        time.sleep(2)


if __name__ == '__main__':
    main()
结果
爬取了463600条数据,部分数据如下:
... ...
1999451,0776,广西 百色
1999449,0772,广西 来宾
1999447,0774,广西 梧州
1999445,0778,广西 河池
1999443,0776,广西 百色
1999441,0771,广西 南宁
1999438,0931,甘肃 兰州
1999436,0931,甘肃 兰州
1999434,0943,甘肃 白银
1999432,0943,甘肃 白银
1999430,0943,甘肃 白银
1999418,0931,甘肃 兰州
1999416,0931,甘肃 兰州
1999414,0941,甘肃 甘南
1999412,0941,甘肃 甘南
... ...