使用Python 分析Nginx access 日志,根据Nginx日志格式进行分割并存入MySQL数据库。(参考网上一些文章)
Nginx access日志格式如下:
使用的是 Nginx 默认日志格式:$remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" "$http_x_forwarded_for"
Nginx access 日志内容如下:
182.19.31.129 - - [2013-08-13T00:00:01-07:00] "GET /css/anniversary.css HTTP/1.1" 304 0 "http://www.chlinux.net/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36" "-"
下面是分析 Nginx access 日志的 Python 代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
|
#!/usr/bin/env python
#coding:utf8
import os
import fileinput
import re
import sys

import MySQLdb

# Location of the access log to import.  The path may be given as the first
# command-line argument; without one the script falls back to the file name
# it has always used, so existing invocations behave exactly as before.
logfile = open(sys.argv[1] if len(sys.argv) > 1 else "access_20130812.log")
# Regular expressions for the default nginx access-log format:
#   $remote_addr - $remote_user [$time_local] "$request" $status
#   $body_bytes_sent "$http_referer" "$http_user_agent" "$http_x_forwarded_for"
#
# Each *P fragment below is the INSIDE of a named group; the surrounding
# parentheses are added when nginxLogPattern is assembled.

# Client address, e.g. 203.208.60.230
ipP = r"?P<ip>[\d.]*"

# Timestamp field: starts with "[", ends with "]".  Brackets are excluded from
# the middle so the match cannot run on into a later [...] item (a non-greedy
# "*?" would work as well).
# e.g. [21/Jan/2011:15:04:41 +0800]
timeP = r"""?P<time>\[[^\[\]]*\]"""

# Request line: starts and ends with a double quote; quotes are excluded from
# the middle so the match cannot run on into the next "..." item.
# e.g. "GET /EntpShop.do?method=view&shop_id=391796 HTTP/1.1"
requestP = r"""?P<request>\"[^\"]*\""""

# HTTP status code and response body size, both plain digit runs.
statusP = r"?P<status>\d+"
bodyBytesSentP = r"?P<bodyByteSent>\d+"

# Referer: a double-quoted item, same shape as the request line.
# e.g. "http://test.myweb.com/myAction.do?method=view&mod_id=&id=1346"
referP = r"""?P<refer>\"[^\"]*\""""

# User agent: a double-quoted item, same shape as the request line.
# e.g. "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
userAgentP = r"""?P<userAgent>\"[^\"]*\""""

# Parenthesised parts of a user agent (no nested parentheses inside),
# e.g. (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
userSystems = re.compile(r'\([^\(\)]*\)')

# Runs of characters containing no ")" that end with a double quote.  On a
# quoted user agent this yields the opening quote first and then the text
# that follows the last ")" up to and including the closing quote.
userlius = re.compile(r'[^\)]*\"')

# Full line pattern.  Items are told apart by the single spaces and the "-"
# separators between them; each item uses its own fragment from above.
# re.VERBOSE ignores unescaped whitespace, hence the "\ " for literal spaces.
# $remote_user is matched by a bare \S+ (no group) so that lines from
# authenticated users are accepted too while the seven capture groups keep
# their positions: ip, time, request, status, bodyByteSent, refer, userAgent.
nginxLogPattern = re.compile(
    r"(%s)\ -\ \S+\ (%s)\ (%s)\ (%s)\ (%s)\ (%s)\ (%s)"
    % (ipP, timeP, requestP, statusP, bodyBytesSentP, referP, userAgentP),
    re.VERBOSE,
)
# Database connection settings, kept together in one mapping and expanded
# into keyword arguments for MySQLdb.connect().
# NOTE(review): host and credentials are hard-coded in the script.
db_settings = {
    'host': '192.168.1.22',
    'user': 'test',
    'passwd': 'pass',
    'port': 3306,
    'db': 'python',
}
conn = MySQLdb.connect(**db_settings)
cur = conn.cursor()

# Parameterized INSERT with nine placeholders; values are supplied per row at
# execute() time.  No column list is given, so the values are positional.
sql = "INSERT INTO python.test VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# Import loop: read the access log line by line, split every line that fits
# the expected format into its fields and insert one row per line into MySQL.
#
# Uses names set up earlier in the script: logfile (the open log file),
# nginxLogPattern / userSystems / userlius (compiled regexes), conn / cur
# (MySQLdb connection and cursor) and sql (the nine-placeholder INSERT).
# The print() calls take a single argument so they behave the same on
# Python 2 and Python 3.
try:
    for line in logfile:
        matchs = nginxLogPattern.match(line)
        if matchs is None:
            # Lines that do not fit the expected log format are skipped.
            continue

        allGroup = matchs.groups()
        ip, time, request, status, bodyBytesSent, refer, userAgent = allGroup[:7]

        # "[2013-08-13T00:00:01-07:00]" -> "2013-08-13 00:00:01":
        # swap the "T" for a space, drop the leading "[" and the trailing
        # six-character UTC offset plus "]".
        Time = time.replace('T', ' ')[1:-7]

        # Fallback row: the whole user agent goes into the seventh column and
        # the two derived columns stay empty.  Used for short user agents and
        # for user agents that cannot be split.
        value = [ip, Time, request, status, bodyBytesSent, refer, userAgent, "", ""]

        if len(userAgent) > 20:
            # First space-separated token of the (still quoted) user agent.
            userkel = userAgent.split(' ')[0]
            try:
                # First parenthesised part of the user agent.
                usersystem = userSystems.findall(userAgent)[0]
                # Second match: the text after the last ")" up to the closing quote.
                userliu = userlius.findall(userAgent)[1]
            except IndexError:
                # Nothing to split out; keep the fallback row.
                pass
            else:
                print(usersystem)
                value = [ip, Time, request, status, bodyBytesSent, refer,
                         userkel, usersystem, userliu]
                print(value)

        try:
            result = cur.execute(sql, value)
            print(result)
        except MySQLdb.Error as e:
            # Report the failed insert and carry on with the next line.
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

    # All inserts are committed together once the whole file has been read.
    conn.commit()
finally:
    # Release the file handle and the connection even if the loop aborts.
    logfile.close()
    conn.close()
|
存入数据库后数据是如下格式:
本文转自1594cqb 51CTO博客,原文链接:http://blog.51cto.com/wolfchen/1374470,如需转载请自行联系原作者