When analyzing URLs from nginx logs in Hive, you frequently need to parse URLs.
Hive's built-in parse_url function can do this, but it is strict about the input format and cannot be applied directly to the request field of an nginx log.
hive -e "select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from dual"
facebook.com
hive -e "select parse_url('facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') from dual"
NULL
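For reference, parse_url also accepts a key as a third argument to pull out a single query parameter; this is the behavior the custom UDF below reproduces for raw nginx request lines:
hive -e "select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY', 'k1') from dual"
v1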
The same thing can be done with regexp_extract, but that means writing a regular expression by hand, and performance is also a concern.
hive -e "select regexp_extract('GET /vips-mobile/router.do?api_key=24415b921531551cb2ba756b885ce783&app_version=1.8.6&fields=sku_id HTTP/1.1','.+? +(.+?)app_version=(.+?)&(.+) .+?',2) from dual"
1.8.6
So I decided to write my own, borrowing from the built-in parse_url UDF.
The code is as follows:
package com.hive.myudf;

import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.UDF;

// Parses the request field of an nginx access log ("GET /path?k=v HTTP/1.1"):
// the host and the request URI are glued together into a full URL, which is
// then taken apart with java.net.URL, mirroring the built-in parse_url UDF.
public class UDFNginxParseUrl extends UDF {
    private String schemal = "http://";
    private String host1 = null;
    private Pattern p1 = null;
    private URL url = null;
    private Pattern p = null;
    private String lastKey = null;

    public UDFNginxParseUrl() {
    }

    public String evaluate(String host1, String urlStr, String partToExtract) {
        if (host1 == null || urlStr == null || partToExtract == null) {
            return null;
        }
        // Split the request line into "METHOD URI PROTOCOL".
        p1 = Pattern.compile("(.+?) +(.+?) (.+)");
        Matcher m1 = p1.matcher(urlStr);
        if (m1.matches()) {
            // Rebuild a complete URL from the host plus the request URI.
            String realUrl = schemal + host1 + m1.group(2);
            System.out.println("URL is " + realUrl);
            try {
                url = new URL(realUrl);
            } catch (Exception e) {
                return null;
            }
        } else {
            // The request line is not in the expected form; bail out rather
            // than reusing a stale (or null) URL from a previous row.
            return null;
        }
        /*
        if (lastUrlStr == null || !urlStr.equals(lastUrlStr)) {
            try {
                url = new URL(urlStr);
            } catch (Exception e) {
                return null;
            }
        }
        lastUrlStr = urlStr;
        */
        if (partToExtract.equals("HOST")) {
            return url.getHost();
        }
        if (partToExtract.equals("PATH")) {
            return url.getPath();
        }
        if (partToExtract.equals("QUERY")) {
            return url.getQuery();
        }
        if (partToExtract.equals("REF")) {
            return url.getRef();
        }
        if (partToExtract.equals("PROTOCOL")) {
            return url.getProtocol();
        }
        if (partToExtract.equals("FILE")) {
            return url.getFile();
        }
        if (partToExtract.equals("AUTHORITY")) {
            return url.getAuthority();
        }
        if (partToExtract.equals("USERINFO")) {
            return url.getUserInfo();
        }
        return null;
    }

    // Four-argument form: extract the value of a single query parameter.
    public String evaluate(String host, String urlStr, String partToExtract, String key) {
        if (!partToExtract.equals("QUERY")) {
            return null;
        }
        String query = this.evaluate(host, urlStr, partToExtract);
        if (query == null) {
            return null;
        }
        // Cache the compiled key pattern across rows that reuse the same key.
        if (!key.equals(lastKey)) {
            p = Pattern.compile("(&|^)" + key + "=([^&]*)");
        }
        lastKey = key;
        Matcher m = p.matcher(query);
        if (m.find()) {
            return m.group(2);
        }
        return null;
    }
}
After adding the jar and creating the function, test the UDF.
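The registration statements would look roughly like this (the jar path is an assumption; the class name comes from the code above, and nginx_url_parse matches the function name used in the tests below):
add jar /tmp/hive-nginx-udf.jar;
create temporary function nginx_url_parse as 'com.hive.myudf.UDFNginxParseUrl';
With the function registered, the tests: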
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','HOST') FROM dual;"
test.test.com
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','QUERY','api_key') FROM dual;"
24415
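Judging from the url.getQuery() branch in the code, the three-argument 'QUERY' form should return the whole query string; this output is inferred from the code rather than taken from the original post:
hive -e "select nginx_url_parse('test.test.com','GET /vips-mobile/router.do?api_key=24415&app_version=1.8.6&fields=sku_id HTTP/1.1','QUERY') FROM dual;"
api_key=24415&app_version=1.8.6&fields=sku_id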
With this, the UDF can be applied directly to nginx logs, for example:
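A typical query over an access-log table might look like the following sketch (nginx_log and the host/request columns are hypothetical names, not from the original post):
select nginx_url_parse(host, request, 'QUERY', 'api_key') as api_key, count(*) as pv
from nginx_log
group by nginx_url_parse(host, request, 'QUERY', 'api_key');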
This article was reposted from 菜菜光's 51CTO blog. Original link: http://blog.51cto.com/caiguangguang/1350463. For reprints, please contact the original author.