#!/usr/bin/python
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import random
import lxml
import re
import threading
from queue import Queue
from threading import Lock
import time
# Pool of browser User-Agent strings; get_headers() picks one at random per request.
userlist= [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Global queue holding every scraped proxy ("ip:port"). A Queue is used rather
# than a list because Queue operations are thread-safe, so no extra lock is needed.
q= Queue()
# Global counter: number of proxies tested so far, summed over all threads.
count = 0
# Global lock (held by the worker threads while they update `count`).
mylock = Lock()
# Queue holding the proxies that passed verification.
q_ok= Queue()
# List of the worker threads. They are join()ed together only after all of them
# have been created; joining right after creating each one would run them one
# at a time instead of concurrently.
mythread_list=[]
def get_headers():
    """Build the HTTP headers for one outgoing request.

    The User-Agent is drawn at random from the module-level ``userlist`` on
    every call, so consecutive requests present different browser identities.

    Returns:
        dict: The ``User-Agent``, ``Accept`` and ``Accept-Encoding`` headers.
    """
    chosen_agent = random.choice(userlist)
    return {
        'User-Agent': chosen_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
    }
# Scrape the proxy IPs and ports listed on the given page range and store them
# in the global queue `q`.
def run(begin, end, time_out=10):
    """Crawl listing pages ``begin`` .. ``end - 1`` and enqueue every proxy found.

    Each page is fetched, parsed with BeautifulSoup, and every table row that
    carries an address and a port is pushed onto the global queue ``q`` as an
    ``"ip:port"`` string. A failure on one page is printed and the crawl moves
    on to the next page.

    Args:
        begin: First page number to fetch (inclusive).
        end: Page number to stop at (exclusive).
        time_out: Seconds to wait for each page before giving up, so that a
            stalled server cannot hang the crawl indefinitely.
    """
    url = "http://www.xicidaili.com/wt/"
    for page in range(begin, end):
        page_url = url + str(page)
        print(page_url)
        try:
            ret = requests.get(
                url=page_url, headers=get_headers(), timeout=time_out
            )
            ret.raise_for_status()  # raise unless the request succeeded
            ret.encoding = ret.apparent_encoding  # decode using the detected page encoding
            text = BeautifulSoup(ret.text, 'lxml')
            # Row 0 is the table header, so start from row 1.
            for tr in text.find_all(name='tr')[1:]:
                # Cell 0 is the country icon; address and port follow it.
                td_list = tr.find_all(name='td')[1:]
                if len(td_list) < 2:
                    # Not a proxy row: skip it instead of letting an
                    # IndexError discard the remaining rows of this page.
                    continue
                host = td_list[0].text.strip()
                port = td_list[1].text.strip()
                if host and port:
                    q.put(host + ":" + port)
        except Exception as e:
            # Deliberate best-effort: report the failure and continue with
            # the next page.
            print(e)
# Check whether a proxy is usable. Alternative test pages:
# http://httpbin.org/ip or http://2018.ip138.com/ic.asp
def check_ip(ip, time_out, test_url="http://2019.ip138.com/ic.asp"):
    """Fetch ``test_url`` through the proxy ``ip`` and record it if it works.

    The proxy counts as usable when the request returns HTTP 200 and the
    first IPv4-looking string in the response body equals the host part of
    ``ip``. Usable proxies are put on the global queue ``q_ok``.

    Args:
        ip: Proxy address in ``"host:port"`` form.
        time_out: Request timeout in seconds.
        test_url: Page fetched through the proxy.

    Returns:
        bool: True when the proxy was verified and queued, False otherwise
        (including when the request raised; the error is printed).
    """
    proxies = {'http': ip}
    ip_s = ip.split(":")[0]  # host part of the proxy, compared against the page below
    try:
        ret = requests.get(url=test_url, headers=get_headers(), proxies=proxies, timeout=time_out)
        if ret.status_code != 200:
            return False
        ret.encoding = ret.apparent_encoding
        # Raw string: '\d' and '\.' are invalid escapes in a normal literal.
        found = re.search(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ret.text)
        if found is None or found.group(0) != ip_s:
            return False
        print("线程:{0} 验证 {1} 可用:".format(threading.current_thread().name, ip))
        q_ok.put(ip)
        return True
    except Exception as e:
        # Dead or misbehaving proxies are expected; report and move on.
        print(e)
        return False
# Worker thread class: takes the "ip:port" entries scraped into the global
# queue `q` and checks each one against the proxy test page.
class Mythread(threading.Thread):
    """Worker that drains the global queue ``q`` and validates each proxy."""

    def __init__(self, name, args):
        """Create the worker.

        Args:
            name: Thread name (shown in the "proxy usable" message).
            args: Forwarded unchanged to ``threading.Thread``.
        """
        threading.Thread.__init__(self, name=name, args=args)

    def run(self):
        """Validate proxies from ``q`` until the queue is empty."""
        # Local import: the file imports only `Queue` from the queue module.
        from queue import Empty

        global count
        while True:
            # A non-blocking get avoids the check-then-act race of
            # `while q.qsize(): q.get()`, where another worker could take the
            # last item after the size check and leave this thread blocked
            # in get() forever.
            try:
                value = q.get_nowait()  # next "ip:port" to test
            except Empty:
                break
            with mylock:
                count = count + 1
                checked = count  # snapshot taken under the lock
            check_ip(ip=value, time_out=5)
            if checked % 50 == 0:
                # Progress report after every 50 proxies.
                # NOTE: `qsize` is a module-level name assigned by the script
                # entry point before the threads are started.
                print("已经检查了代理IP{0}个,总共{1}个:".format(checked, qsize))
if __name__ == '__main__':
    # Wall-clock start, used for the elapsed-time report at the end.
    started_at = time.time()

    # Scrape listing pages 1 through 7 (the end bound is exclusive).
    run(1, 8)

    # Kept as a module-level name: the worker threads read it when printing
    # their progress messages.
    qsize = q.qsize()
    print('now get {0} counts ip to used'.format(qsize))

    # Launch 50 worker threads.
    for number in range(50):
        print('启动第{0}号线程'.format(number))
        worker = Mythread('{0}号线程'.format(number), ())
        worker.start()
        mythread_list.append(worker)

    # The main thread waits for every worker to finish.
    for worker in mythread_list:
        worker.join()

    elapsed = time.time() - started_at
    print("检查可用的代理IP如下:共{0}个,耗时{1} 秒".format(q_ok.qsize(), elapsed))

    # Print every verified proxy, emptying the result queue.
    while True:
        if q_ok.empty():
            break
        print(q_ok.get())
    print("所有子线程执行完成,程序退出!")