假如有一个web系统,每天生成以下日志文件:
2020年12月21日数据 192.228.33.6,hunter,2020-12-21 10:30:20,/a 192.228.33.7,hunter,2020-12-21 10:30:26,/b 192.228.33.6,jack,2020-12-21 10:30:27,/a 192.228.33.8,tom,2020-12-21 10:30:28,/b 192.228.33.9,rose,2020-12-21 10:30:30,/b 192.228.33.10,julia,2020-12-21 10:30:40,/c 2020年12月22日数据 192.228.33.22,hunter,2020-12-22 10:30:20,/a 192.228.33.18,jerry,2020-12-22 10:30:30,/b 192.228.33.26,jack,2020-12-22 10:30:40,/a 192.228.33.18,polo,2020-12-22 10:30:50,/b 192.228.33.39,nissan,2020-12-22 10:30:53,/b 192.228.33.39,nissan,2020-12-22 10:30:55,/a 192.228.33.39,nissan,2020-12-22 10:30:58,/c 192.228.33.20,ford,2020-12-22 10:30:54,/c 2020年12月23日数据 192.228.33.46,hunter,2020-12-23 10:30:21,/a 192.228.43.18,jerry,2020-12-23 10:30:22,/b 192.228.43.26,tom,2020-12-23 10:30:23,/a 192.228.53.18,bmw,2020-12-23 10:30:24,/b 192.228.63.39,benz,2020-12-23 10:30:25,/b 192.228.33.25,haval,2020-12-23 10:30:30,/c 192.228.33.10,julia,2020-12-23 10:30:40,/c
需求
1.建立一个表,来存储每天新增的数据(分区表)
2.统计每天的活跃用户(日活)(需要用户的ip,用户的账号,用户访问时间最早的一条url和时间)
3.统计每天的新增用户(日新)
实现
建表(存储数据的分区表)
create table t_web_log(ip string,uid string,access_time string,url string) partitioned by (day string) row format delimited fields terminated by ',';
导数据
load data local inpath '/root/hivetest/log.21' into table t_web_log partition (day='2020-12-21'); load data local inpath '/root/hivetest/log.22' into table t_web_log partition (day='2020-12-22'); load data local inpath '/root/hivetest/log.23' into table t_web_log partition (day='2020-12-23');
show partitions t_web_log;(查看分区表)
指标统计
每日活跃用户统计
1.建一个保存日活数据的表
create table t_user_active_day(ip string,uid string,first_access string,url string) partitioned by (day string);
2.从日志表中查出日活数据并插入日活数据表
插入21日数据 insert into table t_user_active_day partition(day='2020-12-21') select ip,uid,access_time,url from ( select ip,uid,access_time,url, row_number() over(partition by uid order by access_time) as rn from t_web_log where day='2020-12-21') tmp where rn=1; 插入22日数据 insert into table t_user_active_day partition(day='2020-12-22') select ip,uid,access_time,url from ( select ip,uid,access_time,url, row_number() over(partition by uid order by access_time) as rn from t_web_log where day='2020-12-22') tmp where rn=1; 插入23日数据 insert into table t_user_active_day partition(day='2020-12-23') select ip,uid,access_time,url from ( select ip,uid,access_time,url, row_number() over(partition by uid order by access_time) as rn from t_web_log where day='2020-12-23') tmp where rn=1;
每日新用户统计
思路——将当日活跃用户跟历史用户表关联,找出那些在历史用户表中尚不存在的用户
1.建历史用户表
create table t_user_history(uid string);
2.建一个存放新用户的表
create table t_user_new_day like t_user_active_day;
3.求出每日的新用户并把数据插入新用户表
21日新用户 insert into table t_user_new_day partition (day='2020-12-21') select ip,uid,first_access,url from ( select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid from t_user_active_day a left join t_user_history b on a.uid=b.uid where a.day='2020-12-21') tmp where tmp.b_uid is null; 22日新用户 insert into table t_user_new_day partition (day='2020-12-22') select ip,uid,first_access,url from ( select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid from t_user_active_day a left join t_user_history b on a.uid=b.uid where a.day='2020-12-22') tmp where tmp.b_uid is null; 23日新用户 insert into table t_user_new_day partition (day='2020-12-23') select ip,uid,first_access,url from ( select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid from t_user_active_day a left join t_user_history b on a.uid=b.uid where a.day='2020-12-23') tmp where tmp.b_uid is null;
4.将每日的新用户插入历史表
21日数据 insert into table t_user_history select uid from t_user_new_day where day='2020-12-21'; 22日数据 insert into table t_user_history select uid from t_user_new_day where day='2020-12-22'; 23日数据 insert into table t_user_history select uid from t_user_new_day where day='2020-12-23';
加餐
清空表
truncate table t_user_active_day;
修改Linux系统时间
date -s '2020-12-20 00:20:00'
统计日新、日活的shell脚本
vi user_etl.sh
#!/bin/bash day_str=`date -d '-1 day'+'%Y-%m-%d'` echo "准备处理$day_str的数据......" hive_exec=/root/apps/hive-1.2.1/bin/hive HQL_user_active_day=" insert into table exercise.t_user_active_day partition(day=\"$day_str\") select ip,uid,access_time,url from ( select ip,uid,access_time,url, row_number() over(partition by uid order by access_time) as rn from t_web_log where day=\"$day_str\") tmp where rn=1; " $hive_exec "$HQL_user_active_day" HQL_user_new_day=" insert into table t_user_new_day partition (day=\"$day_str\") select ip,uid,first_access,url from ( select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid from t_user_active_day a left join t_user_history b on a.uid=b.uid where a.day=\"$day_str\") tmp where tmp.b_uid is null; " $hive_exec "$HQL_user_new_day" HQL_new_to_history=" insert into table t_user_history select uid from t_user_new_day where day=\"$day_str\; " $hive_exec "$HQL_new_to_history"
运行:sh user_etl.sh