背景:利用爬虫,爬取网站页面广告元素,监控爬取元素的数目,定时发送监控邮件
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
'''
@xiayun
@896365105@qq.com
#爬取网站内容,利用phantomjs:IP代理+修改UA+动态页面执行JS
'''
import re
import smtplib
import sys
import time
import urllib
import urllib2
from email.header import Header
from email.mime.text import MIMEText
from email.utils import formataddr

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def reptile(proxy_file="./proxy_ip.txt", url='网页地址', pattern=r"广告元素",
            expected_count=22):
    """Load the monitored page through PhantomJS, one proxy at a time.

    Every non-blank line of ``proxy_file`` is treated as one proxy. For each
    proxy a PhantomJS driver is started with that proxy and a spoofed mobile
    User-Agent, ``url`` is loaded, and the regex ``pattern`` is counted in
    the page source. The loop stops at the first proxy whose count equals
    ``expected_count``.

    Args:
        proxy_file: Path of the proxy pool file, one proxy per line.
        url: Page to load. The default is the placeholder from the original
            script and must be replaced with a real address.
        pattern: Regular expression identifying one ad element. The default
            is the placeholder from the original script.
        expected_count: Number of matches that means the page is healthy.

    Returns:
        A ``(result, data)`` tuple: ``result`` is the status message and
        ``data`` is the page source of the last attempt (``""`` when the
        proxy pool is empty and nothing was fetched).
    """
    # Kept from the original implementation: the outcome is also published
    # through these module-level names.
    global result, data
    # Bind both names up front so an empty proxy pool yields a well-defined
    # return value instead of a NameError or a stale previous result.
    result = "The webpage maybe bad"
    data = ""

    # Read the whole pool (not only its first line) and close the file.
    # The pool is unreliable by nature; validating proxies beforehand is
    # advisable.
    with open(proxy_file, 'r') as proxy_fh:
        proxies = [line.strip() for line in proxy_fh if line.strip()]
    print(proxies)

    user_agent = ('Mozilla/5.0 (baomihua@iPhone; CPU iPhone OS 9_1 like '
                  'Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) '
                  'Version/9.0 Mobile/13B143 Safari/601.1')
    ad_rule = re.compile(pattern)

    for proxy in proxies:
        print(proxy)
        service_args = ['--proxy-type=HTTP', '--proxy=%s' % proxy]
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Override the User-Agent header sent by PhantomJS.
        dcap["phantomjs.page.settings.userAgent"] = user_agent

        # Drive PhantomJS like a browser; service_args carries the proxy.
        driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                     service_args=service_args)
        try:
            # Access timeouts, in seconds.
            driver.implicitly_wait(60)
            driver.set_page_load_timeout(60)
            try:
                driver.get(url)
            except Exception:
                # Best effort: whatever was loaded so far is still inspected.
                print("timeout")
            data = driver.page_source
            # NOTE(review): the original sleeps after reading page_source;
            # if the pause is meant to let page JS finish, it probably
            # belongs before the read - confirm before moving it.
            time.sleep(20)
        finally:
            # Always terminate the PhantomJS process, even on errors.
            driver.quit()

        counts = len(ad_rule.findall(data))
        print(counts)

        if counts == expected_count:
            print("The webpage is OK!")
            result = ("The webpage is OK!Find %d 广告元素! proxy_IP:%s "
                      % (expected_count, proxy))
            break

        print("%s is bad!" % proxy)
        result = "The webpage maybe bad"

    print("close")
    # Hand back the status message and the page source.
    return result, data
def send_mail(result, data):
    """Mail the monitoring outcome.

    Args:
        result: Status message; used as the subject and as the display name
            of the sender.
        data: Page source; sent as the plain-text UTF-8 body.

    Returns:
        True when the message was handed to the SMTP server, False when
        connecting, logging in or sending failed.
    """
    receivers = ['XXX@XX.com']          # recipients
    mail_host = 'smtp.exmail.qq.com'    # SMTP host of the mail provider
    mail_user = 'xxx@xxx.com'           # sender account
    mail_pass = 'xxxx'                  # sender password

    title = str(result)
    msg = MIMEText(data, 'plain', 'utf-8')  # plain-text body
    msg['Subject'] = Header(title, 'utf-8')
    # Only the display name is RFC 2047 encoded; the address itself must
    # stay a plain addr-spec or servers may reject the From header.
    msg['From'] = formataddr((Header(title, 'utf-8').encode(), mail_user))
    # Address lists are comma-separated and need no charset wrapping.
    msg['To'] = ", ".join(receivers)

    smtp = smtplib.SMTP()
    connected = False
    try:
        smtp.connect(mail_host)
        connected = True
        smtp.login(mail_user, mail_pass)
        # The envelope sender is the bare account address.
        smtp.sendmail(mail_user, receivers, msg.as_string())
    except (smtplib.SMTPException, IOError):
        # IOError also covers socket-level failures (DNS, refused, timeout)
        # so a network hiccup is reported instead of propagating.
        print("Error: 无法发送邮件")
        return False
    finally:
        # Release the connection on failure as well as on success.
        if connected:
            smtp.close()

    print("发送成功")
    return True
if __name__ == '__main__':
    # Run one monitoring round every 10 minutes until interrupted.
    try:
        while True:
            print('start' + ' ' + time.ctime())
            result, data = reptile()
            send_mail(result=result, data=data)
            print('stop' + ' ' + time.ctime())
            time.sleep(600)
    except KeyboardInterrupt:
        # The loop never ends on its own, so Ctrl-C is the regular way to
        # stop the monitor; exit with status 0 instead of a traceback.
        sys.exit(0)
|
本文转自YU文武貝 51CTO博客,原文链接:http://blog.51cto.com/linuxerxy/1893893 ,如需转载请自行联系原作者