ETL
在运行核心业务MapReduce程序之前,往往要先对数据进行清洗,清理掉不符合要求的数据。清洗过程通常只需要运行Mapper程序,不需要运行Reducer程序。
需求
去除日志中字段个数小于等于11的日志(字段以空格分隔),即只保留字段个数大于11的日志。
输入数据
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] “GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1” 304 0 “-” “Mozilla/4.0 (compatible;)” 183.49.46.228 - - [18/Sep/2013:06:49:23 +0000] “-” 400 0 “-” “-” 163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 163.177.71.12 - - [18/Sep/2013:06:49:36 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 101.226.68.137 - - [18/Sep/2013:06:49:42 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 101.226.68.137 - - [18/Sep/2013:06:49:45 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] “GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0” 200 185524 “http://cos.name/category/software/packages/” “Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36” 222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] “GET /images/my.jpg HTTP/1.1” 200 19939 “http://www.angularjs.cn/A00n” “Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36” 222.68.172.190 - - [18/Sep/2013:06:50:08 +0000] “-” 400 0 “-” “-” 183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 66.249.66.84 - - [18/Sep/2013:06:50:28 +0000] “GET /page/6/ HTTP/1.1” 200 27777 “-” “Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)” 221.130.41.168 - - [18/Sep/2013:06:50:37 +0000] “GET /feed/ HTTP/1.1” 304 0 “-” “Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36” 157.55.35.40 - - [18/Sep/2013:06:51:13 +0000] “GET /robots.txt HTTP/1.1” 200 150 “-” “Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)” 50.116.27.194 - - [18/Sep/2013:06:51:35 +0000] “POST /wp-cron.php?doing_wp_cron=1379487095.2510800361633300781250 HTTP/1.0” 200 0 “-” 
“WordPress/3.6; http://blog.fens.me” 58.215.204.118 - - [18/Sep/2013:06:51:35 +0000] “GET /nodejs-socketio-chat/ HTTP/1.1” 200 10818 “http://www.google.com/url?sa=t&rct=j&q=nodejs%20%E5%BC%82%E6%AD%A5%E5%B9%BF%E6%92%AD&source=web&cd=1&cad=rja&ved=0CCgQFjAA&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%6e%6f%64%65%6a%73%2d%73%6f%63%6b%65%74%69%6f%2d%63%68%61%74%2f&ei=rko5UrylAefOiAe7_IGQBw&usg=AFQjCNG6YWoZsJ_bSj8kTnMHcH51hYQkAA&bvm=bv.52288139,d.aGc” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] “GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1” 304 0 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:51:35 +0000] “GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1” 304 0 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] “GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1” 304 0 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] “GET /wp-content/uploads/2013/08/chat.png HTTP/1.1” 200 48968 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] “GET /wp-content/uploads/2013/08/chat2.png HTTP/1.1” 200 59852 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:51:37 +0000] “GET /wp-content/uploads/2013/08/socketio.png HTTP/1.1” 200 80493 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.248.178.212 - - [18/Sep/2013:06:51:37 +0000] “GET /nodejs-grunt-intro/ HTTP/1.1” 200 51770 
“http://blog.fens.me/series-nodejs/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] “GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1” 200 7200 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] “GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1” 200 786 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] “GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1” 200 45307 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] “GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1” 200 93128 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] “GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1” 200 786 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 
3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.215.204.118 - - [18/Sep/2013:06:51:41 +0000] “-” 400 0 “-” “-” 58.215.204.118 - - [18/Sep/2013:06:51:41 +0000] “-” 400 0 “-” “-” 58.215.204.118 - - [18/Sep/2013:06:51:41 +0000] “-” 400 0 “-” “-” 157.55.35.40 - - [18/Sep/2013:06:51:43 +0000] “GET /rhadoop-java-basic/ HTTP/1.1” 200 26780 “-” “Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)” 58.248.178.212 - - [18/Sep/2013:06:51:43 +0000] “GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1” 200 7200 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.248.178.212 - - [18/Sep/2013:06:51:45 +0000] “GET /wp-content/uploads/2013/08/grunt.png HTTP/1.1” 200 199040 “http://blog.fens.me/nodejs-grunt-intro/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)” 58.215.204.118 - - [18/Sep/2013:06:52:26 +0000] “GET /nodejs-async/ HTTP/1.1” 200 12647 “http://blog.fens.me/nodejs-socketio-chat/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:52:27 +0000] “GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1” 304 0 “http://blog.fens.me/nodejs-async/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:52:27 +0000] “GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1” 304 0 “http://blog.fens.me/nodejs-async/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 58.215.204.118 - - [18/Sep/2013:06:52:27 +0000] “GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1” 304 0 “http://blog.fens.me/nodejs-async/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 
163.177.71.12 - - [18/Sep/2013:06:52:29 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 58.215.204.118 - - [18/Sep/2013:06:52:29 +0000] “GET /nodejs-async/?cf_action=sync_comments&post_id=2357 HTTP/1.1” 200 48 “http://blog.fens.me/nodejs-async/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 163.177.71.12 - - [18/Sep/2013:06:52:32 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 58.215.204.118 - - [18/Sep/2013:06:52:32 +0000] “-” 400 0 “-” “-” 58.215.204.118 - - [18/Sep/2013:06:52:33 +0000] “-” 400 0 “-” “-” 58.215.204.118 - - [18/Sep/2013:06:52:33 +0000] “-” 400 0 “-” “-” 101.226.68.137 - - [18/Sep/2013:06:52:36 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 101.226.68.137 - - [18/Sep/2013:06:52:39 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 183.195.232.138 - - [18/Sep/2013:06:53:12 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 183.195.232.138 - - [18/Sep/2013:06:53:12 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 222.66.59.174 - - [18/Sep/2013:06:53:30 +0000] “GET /images/my.jpg HTTP/1.1” 200 19939 “http://www.angularjs.cn/A00n” “Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0” 222.66.59.174 - - [18/Sep/2013:06:53:36 +0000] “-” 400 0 “-” “-” 216.24.201.254 - - [18/Sep/2013:06:53:57 +0000] “GET /feed/ HTTP/1.1” 200 33867 “-” “Mozilla/5.0 (Ubuntu; X11; Linux x86_64; rv:8.0) Gecko/20100101 Firefox/8.0” 58.215.204.118 - - [18/Sep/2013:06:54:48 +0000] “GET /r-json-rjson/ HTTP/1.1” 200 11500 “http://blog.fens.me/nodejs-async/” “Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0” 61.135.216.105 - - [18/Sep/2013:06:54:51 +0000] “GET /comments/feed/ HTTP/1.1” 304 0 “-” “Mozilla/5.0 (compatible;YoudaoFeedFetcher/1.0;http://www.youdao.com/help/reader/faq/topic006/;1 subscribers;)” 222.66.59.174 - - [18/Sep/2013:06:55:19 +0000] “-” 400 0 “-” “-” 163.177.71.12 - - [18/Sep/2013:06:55:24 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 163.177.71.12 
- - [18/Sep/2013:06:55:27 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 101.226.68.137 - - [18/Sep/2013:06:55:30 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 101.226.68.137 - - [18/Sep/2013:06:55:33 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 222.66.59.174 - - [18/Sep/2013:06:55:51 +0000] “GET /images/my.jpg HTTP/1.1” 200 19939 “http://www.angularjs.cn/” “Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)” 183.195.232.138 - - [18/Sep/2013:06:56:07 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 183.195.232.138 - - [18/Sep/2013:06:56:07 +0000] “HEAD / HTTP/1.1” 200 20 “-” “DNSPod-Monitor/1.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /vps-ip-dns/ HTTP/1.1” 200 11403 “http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=6&cad=rja&ved=0CHIQFjAF&url=http%3A%2F%2Fblog.fens.me%2Fvps-ip-dns%2F&ei=j045UrP5AYX22AXsg4G4DQ&usg=AFQjCNGsJfLMNZnwWXNpTSUl6SOEzfF6tg&sig2=YY1oxEybUL7wx3IrVIMfHA&bvm=bv.52288139,d.b2I” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/style.css HTTP/1.1” 200 7554 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1” 200 32851 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1” 200 7200 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1” 200 786 
“http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/js/jquery.cycle.all.min.js HTTP/1.1” 200 7784 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/js/load.js HTTP/1.1” 200 715 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/functions/css/shortcodes.css HTTP/1.1” 200 2899 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/functions/js/shortcode.js HTTP/1.1” 200 333 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/uploads/2013/06/%E5%8A%A8%E6%80%81IP%E8%A7%A3%E6%9E%90.jpg HTTP/1.1” 200 36779 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/uploads/2013/06/wtmart.png HTTP/1.1” 200 26105 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/uploads/2013/06/linux-dns.png HTTP/1.1” 200 14841 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/uploads/2013/06/win-dns.png HTTP/1.1” 200 30694 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET 
/wp-content/uploads/2013/06/win-ssh.png HTTP/1.1” 200 17524 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-includes/images/smilies/icon_smile.gif HTTP/1.1” 200 174 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/images/natty-logo.png HTTP/1.1” 200 1438 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/uploads/2013/05/favicon.ico HTTP/1.1” 200 1150 “-” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /js/baidu.js HTTP/1.1” 200 249 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /js/google.js HTTP/1.1” 200 475 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/images/slide-bg.png HTTP/1.1” 200 934 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/images/sprites/post-type.png HTTP/1.1” 200 2009 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:41 +0000] “GET /wp-content/themes/silesia/images/ico-twitter.png HTTP/1.1” 200 2128 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:41 
+0000] “GET /wp-content/themes/silesia/images/ico-meta.gif HTTP/1.1” 200 73 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/images/crubms-div.png HTTP/1.1” 200 1255 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/images/bullets/5.gif HTTP/1.1” 200 62 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] “GET /wp-content/themes/silesia/images/home-ico.png HTTP/1.1” 200 1103 “http://blog.fens.me/wp-content/themes/silesia/style.css” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:42 +0000] “GET /vps-ip-dns/?cf_action=sync_comments&post_id=499 HTTP/1.1” 200 48 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 71.96.108.116 - - [18/Sep/2013:06:56:46 +0000] “-” 400 0 “-” “-” 71.96.108.116 - - [18/Sep/2013:06:56:46 +0000] “-” 400 0 “-” “-” 71.96.108.116 - - [18/Sep/2013:06:57:34 +0000] “GET /dataguru-beijing-meeting-20130616/ HTTP/1.1” 200 8423 “http://blog.fens.me/vps-ip-dns/” “Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0” 66.249.66.84 - - [18/Sep/2013:06:57:35 +0000] “GET /series-angular/?cf_action=sync_comments&post_id=1991 HTTP/1.1” 200 48 “-” “Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)” 124.42.13.230 - - [18/Sep/2013:06:57:48 +0000] “GET /mongodb-replica-set/ HTTP/1.1” 200 50131 
“http://www.baidu.com/s?tn=baiduhome_pg&ie=utf-8&bs=%E5%9C%A8linux%E5%90%AF%E5%8A%A8%E4%B8%8Bmongodb.conf&f=8&rsv_bp=1&wd=about+to+fork+child+process%2C+waiting+until+server+is+ready+for+connections.&rsv_n=2&rsv_sug3=1&rsv_sug1=1&rsv_sug4=187&inputT=906” “Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:48 +0000] “GET /wp-content/themes/silesia/style.css HTTP/1.1” 200 34090 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:50 +0000] “GET /wp-content/themes/silesia/js/load.js HTTP/1.1” 200 715 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:50 +0000] “GET /wp-content/themes/silesia/functions/js/shortcode.js HTTP/1.1” 200 333 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:50 +0000] “GET /wp-content/themes/silesia/functions/css/shortcodes.css HTTP/1.1” 200 2899 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:50 +0000] “GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1” 200 7200 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:50 +0000] “GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1” 200 786 
“http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:51 +0000] “GET /wp-content/themes/silesia/js/jquery.cycle.all.min.js HTTP/1.1” 200 31539 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:51 +0000] “GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1” 200 93128 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /wp-content/themes/silesia/images/slide-bg.png HTTP/1.1” 200 934 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /js/google.js HTTP/1.1” 200 475 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /js/baidu.js HTTP/1.1” 200 249 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /wp-content/uploads/2013/05/rs.png HTTP/1.1” 200 35768 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /wp-content/themes/silesia/images/ico-twitter.png HTTP/1.1” 200 2128 
“http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /wp-content/themes/silesia/images/crubms-div.png HTTP/1.1” 200 1255 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /wp-content/themes/silesia/images/home-ico.png HTTP/1.1” 200 1103 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)” 124.42.13.230 - - [18/Sep/2013:06:57:53 +0000] “GET /wp-content/themes/silesia/images/sprites/post-type.png HTTP/1.1” 200 2009 “http://blog.fens.me/mongodb-replica-set/” “Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; BTRS101170; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)”
WebLogMapper
public class WebLogMapper extends Mapper<LongWritable, Text,Text, NullWritable> { @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //1.获取一行数据 String line = value.toString(); //2.切割 String[] words = StringUtils.split(line, ' '); //3.etl if (words.length >= 11){ context.write(value,NullWritable.get()); } } }
WebLogDriver
public class WebLogDriver { public static void main(String[] args) throws Exception { // 输入输出路径需要根据自己电脑上实际的输入输出路径设置 args = new String[] { "D:\\MapReduce_Data_Test\\etl\\input", "D:\\MapReduce_Data_Test\\etl\\output" }; Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(WebLogDriver.class); job.setMapperClass(WebLogMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); // 不需要reduceTask 设置reduceTask个数为0 job.setNumReduceTasks(0); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); boolean b = job.waitForCompletion(true); System.exit(b ? 0 : 1); } }