hive函数使用
小技巧:测试函数的用法,可以专门准备一个专门的dual表
create table dual(x string);
insert into table dual values('');
其实:直接用常量来测试函数即可
select substr("abcdefg",1,3);
substr,在数据库中脚标是从1开始;
hive的所有函数手册:
常用内置函数
类型转换函数
select cast("5" as int)
from dual;select cast("2017-08-03" as date) ;
select cast(current_timestamp as date);
- current_timestamp ,hive中的时间戳
- 字符类型的时间,只能是年月日
示例:
1 |
1995-05-05 13:30:59 |
1200.3 |
2 |
1994-04-05 13:30:59 |
2200 |
3 |
1996-06-01 12:20:30 |
80000.5 |
create table t_fun(id string,birthday string,salary string) row format delimited fields terminated by ',';
日期和浮点型转换
select id,to_date(birthday as date) as bir,cast(salary as float) from t_fun;
数学运算函数
select round(5.4) from dual; ## 5
select round(5.1345,3) from dual; ##5.135
select ceil(5.4) from dual; // select ceiling(5.4) from dual; ## 6
select floor(5.4) from dual; ## 5
select abs(-5.4) from dual; ## 5.4
select greatest(3,5,6) from dual; ## 6
select least(3,5,6) from dual;
示例:
有表如下:
select greatest(cast(salary 1 as double),cast(salary 2 as double),cast(salary 3 as double)) from t_fun2;
结果:
+---------+--+ | _c0 | +---------+--+ | 2000.0 | | 9800.0 | +---------+--+
select min(age) from t_person; 聚合函数
select max(age) from t_person; 聚合函数
字符串函数
substr(string, int start) ## 截取子串
substring(string, int start)
示例:select substr("abcdefg",2) from dual;
substr(string, int start, int len)
substring(string, int start, int len)
示例:select substr("abcdefg",2,3) from dual;
concat(string A, string B...) ## 拼接字符串
concat_ws(string SEP, string A, string B...)
示例:select concat("ab","xy") from dual;
select concat_ws(".","192","168","33","44") from dual;
length(string A)
示例:select length("192.168.33.44") from dual;
split(string str, string pat)
示例:select split("192.168.33.44",".") from dual; 错误的,因为.号是正则语法中的特定字符
select split("192.168.33.44","\\.") from dual;
upper(string str) ##转大写
时间函数
select current_timestamp;
select current_date;
## 取当前时间的毫秒数时间戳
select unix_timestamp();
## unix时间戳转字符串
from_unixtime(bigint unixtime[, string format])
示例:select from_unixtime(unix_timestamp());
select from_unixtime(unix_timestamp(),"yyyy/MM/dd HH:mm:ss");
## 字符串转unix时间戳
unix_timestamp(string date, string pattern)
示例: select unix_timestamp("2017-08-10 17:50:30");
select unix_timestamp("2017/08/10 17:50:30","yyyy/MM/dd HH:mm:ss");
## 将字符串转成日期date
select to_date("2017-09-17 16:58:32");
表生成函数
行转列函数:explode()
假如有以下数据:
1,zhangsan,化学:物理:数学:语文 2,lisi,化学:数学:生物:生理:卫生 3,wangwu,化学:语文:英语:体育:生物
映射成一张表:
create table t_stu_subject(id int,name string,subjects array<string>) row format delimited fields terminated by ',' collection items terminated by ':';
使用explode()对数组字段“炸裂”
然后,我们利用这个explode的结果,来求去重的课程:
select distinct tmp.sub from (select explode(subjects) as sub from t_stu_subject) tmp;
表生成函数lateral view
理解: lateral view 相当于两个表在join
左表:是原表
右表:是explode(某个集合字段)之后产生的表
而且:这个join只在同一行的数据间进行
那样,可以方便做更多的查询:
比如,查询选修了生物课的同学
select a.id,a.name,a.sub from (select id,name,tmp.sub as sub from t_stu_subject lateral view explode(subjects) tmp as sub) a where sub='生物';
集合函数
array_contains(Array<T>, value) 返回boolean值
示例:
select moive_name,array_contains(actors,'吴刚') from t_movie; select array_contains(array('a','b','c'),'c') from dual;
sort_array(Array<T>) 返回排序后的数组
示例:
select sort_array(array('c','b','a')) from dual;
select 'haha',sort_array(array('c','b','a')) as xx from (select 0) tmp;
size(Array<T>) 返回一个int值
示例:
select moive_name,size(actors) as actor_number from t_movie;
size(Map<K.V>) 返回一个int值
map_keys(Map<K.V>) 返回一个数组
map_values(Map<K.V>) 返回一个数组
条件控制函数
case when
语法:
CASE [ expression ]
WHEN condition1 THEN result1
WHEN condition2 THEN result2
...
WHEN conditionn THEN resultn
ELSE result
END
示例:
select id,name, case when age<28 then 'youngth' when age>27 and age<40 then 'zhongnian' else 'old' end from t_user;
IF
select id,if(age>25,'working','worked') from t_user; select moive_name,if(array_contains(actors,'吴刚'),'好电影','烂片儿')from t_movie;
json解析函数:表生成函数
json_tuple函数(根据json中的key,就可以获取到对应value)
只能解析简单的JSON,多层或者嵌套JSON需要自定义JSON解析函数。
示例:
select json_tuple(json,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid) from t_rating_json;
产生结果:
利用json_tuple从原始json数据表中,etl出一个详细信息表:
create table t_rate as select uid, movie, rate, year(from_unixtime(cast(ts as bigint))) as year, month(from_unixtime(cast(ts as bigint))) as month, day(from_unixtime(cast(ts as bigint))) as day, hour(from_unixtime(cast(ts as bigint))) as hour, minute(from_unixtime(cast(ts as bigint))) as minute, from_unixtime(cast(ts as bigint)) as ts from (select json_tuple(rateinfo,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid) from t_json) tmp ;
分析函数:row_number() over()——分组TOPN
需求
有如下数据:
1,18,a,male 2,19,b,male 3,22,c,female 4,16,d,female 5,30,e,male 6,26,f,female
需要查询出每种性别中年龄最大的2条数据
实现:
使用row_number函数,对表中的数据按照性别分组,按照年龄倒序排序并进行标记
hql代码:
select id,age,name,sex, row_number() over(partition by sex order by age desc) as rank from t_rownumber
产生结果:
然后,利用上面的结果,查询出rank<=2的即为最终需求
select id,age,name,sex from (select id,age,name,sex, row_number() over(partition by sex order by age desc) as rank from t_rownumber) tmp where rank<=2;
练习:求出电影评分数据中,每个用户评分最高的topn条数据
hive 窗口分析函数
0: jdbc:hive2://localhost:10000> select * from t_access;
+----------------+---------------------------------+-----------------------+--------------+--+ | t_access.ip | t_access.url | t_access.access_time | t_access.dt | +----------------+---------------------------------+-----------------------+--------------+--+ | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 20170804 | | 192.168.33.3 | http://www.edu360.cn/teach | 2017-08-04 15:35:20 | 20170804 | | 192.168.33.4 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 20170804 | | 192.168.33.4 | http://www.edu360.cn/job | 2017-08-04 16:30:20 | 20170804 | | 192.168.33.5 | http://www.edu360.cn/job | 2017-08-04 15:40:20 | 20170804 | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 20170805 | | 192.168.44.3 | http://www.edu360.cn/teach | 2017-08-05 15:35:20 | 20170805 | | 192.168.33.44 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 20170805 | | 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20 | 20170805 | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20 | 20170805 | | 192.168.133.3 | http://www.edu360.cn/register | 2017-08-06 15:30:20 | 20170806 | | 192.168.111.3 | http://www.edu360.cn/register | 2017-08-06 15:35:20 | 20170806 | | 192.168.34.44 | http://www.edu360.cn/pay | 2017-08-06 15:30:20 | 20170806 | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 20170806 | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 20170806 | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 20170806 | | 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 20170806 | | 192.168.33.36 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 20170806 | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 20170806 | +----------------+---------------------------------+-----------------------+--------------+--+
LAG函数
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lag(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
LEAD函数
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lead(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
+----------------+---------------------------------+----------------------+-----+----------------------+--+ | ip | url | access_time | rn | last_access_time | +----------------+---------------------------------+----------------------+-----+----------------------+--+ | 192.168.111.3 | http://www.edu360.cn/register | 2017-08-06 15:35:20 | 1 | 0 | | 192.168.133.3 | http://www.edu360.cn/register | 2017-08-06 15:30:20 | 1 | 0 | | 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 1 | 0 | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 1 | 2017-08-04 15:35:20 | | 192.168.33.3 | http://www.edu360.cn/teach | 2017-08-04 15:35:20 | 2 | 2017-08-05 15:30:20 | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 3 | 0 | | 192.168.33.36 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 1 | 0 | | 192.168.33.4 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 1 | 2017-08-04 16:30:20 | | 192.168.33.4 | http://www.edu360.cn/job | 2017-08-04 16:30:20 | 2 | 0 | | 192.168.33.44 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 1 | 0 | | 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20 | 1 | 2017-08-06 16:30:20 | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 2 | 2017-08-06 16:30:20 | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 3 | 0 | | 192.168.33.5 | http://www.edu360.cn/job | 2017-08-04 15:40:20 | 1 | 0 | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20 | 1 | 2017-08-06 15:40:20 | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 2 | 2017-08-06 15:40:20 | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 3 | 0 | | 192.168.34.44 | http://www.edu360.cn/pay | 2017-08-06 15:30:20 | 1 | 0 | | 192.168.44.3 | http://www.edu360.cn/teach | 2017-08-05 15:35:20 | 1 | 0 | +----------------+---------------------------------+----------------------+-----+----------------------+--+
FIRST_VALUE 函数
例:取每个用户访问的第一个页面
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
first_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following)as last_access_time
from t_access;
+----------------+---------------------------------+----------------------+-----+---------------------------------+--+ | ip | url | access_time | rn | last_access_time | +----------------+---------------------------------+----------------------+-----+---------------------------------+--+ | 192.168.111.3 | http://www.edu360.cn/register | 2017-08-06 15:35:20 | 1 | http://www.edu360.cn/register | | 192.168.133.3 | http://www.edu360.cn/register | 2017-08-06 15:30:20 | 1 | http://www.edu360.cn/register | | 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 1 | http://www.edu360.cn/job | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 1 | http://www.edu360.cn/stu | | 192.168.33.3 | http://www.edu360.cn/teach | 2017-08-04 15:35:20 | 2 | http://www.edu360.cn/stu | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 3 | http://www.edu360.cn/stu | | 192.168.33.36 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 1 | http://www.edu360.cn/excersize | | 192.168.33.4 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 1 | http://www.edu360.cn/stu | | 192.168.33.4 | http://www.edu360.cn/job | 2017-08-04 16:30:20 | 2 | http://www.edu360.cn/stu | | 192.168.33.44 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 1 | http://www.edu360.cn/stu | | 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20 | 1 | http://www.edu360.cn/job | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 2 | http://www.edu360.cn/job | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 3 | http://www.edu360.cn/job | | 192.168.33.5 | http://www.edu360.cn/job | 2017-08-04 15:40:20 | 1 | http://www.edu360.cn/job | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20 | 1 | http://www.edu360.cn/job | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 2 | http://www.edu360.cn/job | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 3 | http://www.edu360.cn/job | | 192.168.34.44 | http://www.edu360.cn/pay | 2017-08-06 15:30:20 | 1 | http://www.edu360.cn/pay | | 192.168.44.3 | http://www.edu360.cn/teach | 2017-08-05 15:35:20 | 1 | http://www.edu360.cn/teach | +----------------+---------------------------------+----------------------+-----+---------------------------------+--+
LAST_VALUE 函数
例:取每个用户访问的最后一个页面
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
last_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following)as last_access_time
from t_access;
+----------------+---------------------------------+----------------------+-----+---------------------------------+--+ | ip | url | access_time | rn | last_access_time | +----------------+---------------------------------+----------------------+-----+---------------------------------+--+ | 192.168.111.3 | http://www.edu360.cn/register | 2017-08-06 15:35:20 | 1 | http://www.edu360.cn/register | | 192.168.133.3 | http://www.edu360.cn/register | 2017-08-06 15:30:20 | 1 | http://www.edu360.cn/register | | 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 1 | http://www.edu360.cn/job | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 1 | http://www.edu360.cn/stu | | 192.168.33.3 | http://www.edu360.cn/teach | 2017-08-04 15:35:20 | 2 | http://www.edu360.cn/stu | | 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 3 | http://www.edu360.cn/stu | | 192.168.33.36 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 1 | http://www.edu360.cn/excersize | | 192.168.33.4 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 1 | http://www.edu360.cn/stu | | 192.168.33.4 | http://www.edu360.cn/job | 2017-08-04 16:30:20 | 2 | http://www.edu360.cn/stu | | 192.168.33.44 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 1 | http://www.edu360.cn/stu | | 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20 | 1 | http://www.edu360.cn/job | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 2 | http://www.edu360.cn/job | | 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 3 | http://www.edu360.cn/job | | 192.168.33.5 | http://www.edu360.cn/job | 2017-08-04 15:40:20 | 1 | http://www.edu360.cn/job | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20 | 1 | http://www.edu360.cn/job | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 2 | http://www.edu360.cn/job | | 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 3 | http://www.edu360.cn/job | | 192.168.34.44 | http://www.edu360.cn/pay | 2017-08-06 15:30:20 | 1 | http://www.edu360.cn/pay | | 192.168.44.3 | http://www.edu360.cn/teach | 2017-08-05 15:35:20 | 1 | http://www.edu360.cn/teach | +----------------+---------------------------------+----------------------+-----+---------------------------------+--+
累计报表--分析函数实现版
-- sum() over() 函数
select id ,month ,sum(amount) over(partition by id order by month rows between unbounded preceding and current row) from (select id,month, sum(fee) as amount from t_test group by id,month) tmp;
自定义函数
需求:
需要对json数据表中的json数据写一个自定义函数,用于传入一个json,返回一个数据值的数组
json原始数据表:
需要做ETL操作,将json数据变成普通表数据,插入另一个表中:
实现步骤:
1、开发JAVA的UDF类
public class ParseJson extends UDF{ // 重载 :返回值类型 和参数类型及个数,完全由用户自己决定 // 本处需求是:给一个字符串,返回一个数组 public String[] evaluate(String json) { String[] split = json.split("\""); String[] res = new String[]{split[3],split[7],split[11],split[15]}; return res; } }
2、打jar包
在eclipse中使用export即可
3、上传jar包到运行hive所在的linux机器
4、在hive中创建临时函数:
在hive的提示符中:
hive> add jar /root/jsonparse.jar;
然后,在hive的提示符中,创建一个临时函数:
hive>CREATE TEMPORARY FUNCTION jsonp AS 'cn.edu360.hdp.hive.ParseJson';
开发hql语句,利用自定义函数,从原始表中抽取数据插入新表
insert into table t_rate select split(jsonp(json),',')[0], cast(split(jsonp(json),',')[1] as int), cast(split(jsonp(json),',')[2] as bigint), cast(split(jsonp(json),',')[3] as int) from t_rating_json;
注:临时函数只在一次hive会话中有效,重启会话后就无效
如果需要经常使用该自定义函数,可以考虑创建永久函数:
拷贝jar包到hive的类路径中:
cp wc.jar apps/hive-1.2.1/lib/
创建了:
create function pfuncx as 'com.doit.hive.udf.UserInfoParser';
删除函数:
DROP TEMPORARY FUNCTION [IF EXISTS] function_name
DROP FUNCTION[IF EXISTS] function_name
此笔记:整理自小牛课堂
=