hive基本查询
--hive基本查询 select * from a; select max(ip) from a; select count(*) from a;
hive 条件查询
--hive 条件查询 select * from t_pv_log where access_time>'2020-11-26 15:31:33';
关联查询
1.内连接
--1.内连接(笛卡尔积),右边字段依次和左边的连一遍 select t_a.*,t_b.* from a t_a inner join b t_b; +---------+-----------+---------+-----------+--+ | t_a.id | t_a.name | t_b.id | t_b.name | +---------+-----------+---------+-----------+--+ | 1 | a | 2 | bb | | 1 | a | 3 | cc | | 1 | a | 7 | yy | | 1 | a | 9 | pp | | 2 | b | 2 | bb | | 2 | b | 3 | cc | | 2 | b | 7 | yy | | 2 | b | 9 | pp | | 3 | c | 2 | bb | | 3 | c | 3 | cc | | 3 | c | 7 | yy | | 3 | c | 9 | pp | | 4 | d | 2 | bb | | 4 | d | 3 | cc | | 4 | d | 7 | yy | | 4 | d | 9 | pp | | 7 | y | 2 | bb | | 7 | y | 3 | cc | | 7 | y | 7 | yy | | 7 | y | 9 | pp | | 8 | u | 2 | bb | | 8 | u | 3 | cc | | 8 | u | 7 | yy | | 8 | u | 9 | pp | +---------+-----------+---------+-----------+--+ --指定条件 select t_a.*,t_b.* from a t_a inner join b t_b on t_a.id=t_b.id; +---------+-----------+---------+-----------+--+ | t_a.id | t_a.name | t_b.id | t_b.name | +---------+-----------+---------+-----------+--+ | 2 | b | 2 | bb | | 3 | c | 3 | cc | | 7 | y | 7 | yy | +---------+-----------+---------+-----------+--+
2.左连接
--2.左外连接(左连接),左边字段依次和右边的连一遍 select a.*,b.* from a left join b on a.id=b.id; +-------+---------+-------+---------+--+ | a.id | a.name | b.id | b.name | +-------+---------+-------+---------+--+ | 1 | a | NULL | NULL | | 2 | b | 2 | bb | | 3 | c | 3 | cc | | 4 | d | NULL | NULL | | 7 | y | 7 | yy | | 8 | u | NULL | NULL | +-------+---------+-------+---------+--+ --加条件:保留左边所有字段,右边没有符合左边字段的就为null
3.右连接
--3.右外连接(右连接),不加条件和内联接查询结果相同 select a.*,b.* from a right join b on a.id=b.id; +-------+---------+-------+---------+--+ | a.id | a.name | b.id | b.name | +-------+---------+-------+---------+--+ | 2 | b | 2 | bb | | 3 | c | 3 | cc | | 7 | y | 7 | yy | | NULL | NULL | 9 | pp | +-------+---------+-------+---------+--+ --加条件:保留右边所有字段,左边没有符合右边字段的就为null
4.全外连接
--4.全外连接 select a.*,b.* from a full outer join b on a.id=b.id; +-------+---------+-------+---------+--+ | a.id | a.name | b.id | b.name | +-------+---------+-------+---------+--+ | 1 | a | NULL | NULL | | 2 | b | 2 | bb | | 3 | c | 3 | cc | | 4 | d | NULL | NULL | | 7 | y | 7 | yy | | 8 | u | NULL | NULL | | NULL | NULL | 9 | pp | +-------+---------+-------+---------+--+ --加条件:左边右边字段都会保存
5.左半连接(hive独有)
--5.左半连接(hive独有) select a.* from a left semi join b on a.id=b.id; +-------+---------+--+ | a.id | a.name | +-------+---------+--+ | 2 | b | | 3 | c | | 7 | y | +-------+---------+--+ --查询得到与右表相同字段数据的左表数据(故select子句中不能有右表字段)
group by 分组聚合查询
select upper("abcde");--转大写函数 -- 针对每一行计算,指定字段的每行数据都会变 select ip,upper(url),access_time from t_pv_log; --求每条url的访问总次数 +---------------+--------------------+-----------------------+---------------+--+ | t_pv_log.ip | t_pv_log.url | t_pv_log.access_time | t_pv_log.day | +---------------+--------------------+-----------------------+---------------+--+ | 192.168.22.1 | http://sina.com/a | 2020-11-26 15:31:23 | 20201126 | | 192.168.22.4 | http://sina.com/a | 2020-11-26 16:21:33 | 20201126 | | 192.168.33.1 | http://sina.com/a | 2020-11-26 17:11:43 | 20201126 | | 192.168.22.1 | http://sina.com/a | 2020-11-27 15:31:23 | 20201127 | | 192.168.22.4 | http://sina.com/a | 2020-11-27 16:21:33 | 20201127 | | 192.168.33.1 | http://sina.com/a | 2020-11-27 17:11:43 | 20201127 | | 192.168.12.6 | http://sina.com/v | 2020-11-26 15:31:33 | 20201126 | | 192.168.12.6 | http://sina.com/v | 2020-11-27 15:31:33 | 20201127 | | 192.168.22.7 | http://sina.com/f | 2020-11-26 15:31:36 | 20201126 | | 192.168.22.7 | http://sina.com/f | 2020-11-27 15:31:36 | 20201127 | | 192.168.22.1 | http://sina.com/u | 2020-11-26 15:31:33 | 20201126 | | 192.168.22.1 | http://sina.com/u | 2020-11-27 15:31:33 | 20201127 | +---------------+--------------------+-----------------------+---------------+--+ select url,count(1) as cnts from t_pv_log group by url; +--------------------+-------+--+ | url | cnts | +--------------------+-------+--+ | http://sina.com/a | 6 | | http://sina.com/f | 2 | | http://sina.com/u | 2 | | http://sina.com/v | 2 | +--------------------+-------+--+ --语法:group by 字段(字段中相同的数据划分为一组) --注:只能按组查询字段,故select 被分组的那个字段,对分好组的数据进行逐组运算。 --求每个url访问者中ip地址最大的 select url,max(ip) from t_pv_log group by url; --max():分组聚合函数(对一组数据即多行进行运算) +--------------------+---------------+--+ | url | _c1 | +--------------------+---------------+--+ | http://sina.com/a | 192.168.33.1 | | http://sina.com/f | 192.168.22.7 | | http://sina.com/u | 192.168.22.1 | | http://sina.com/v | 192.168.12.6 | +--------------------+---------------+--+ --求每个用户访问同一页面的所有记录中时间最晚的一条 --表分组的样子 +---------------+--------------------+-----------------------+---------------+--+ | t_pv_log.ip | t_pv_log.url | t_pv_log.access_time | t_pv_log.day | +---------------+--------------------+-----------------------+---------------+--+ | 192.168.22.1 | http://sina.com/a | 2020-11-26 15:31:23 | 20201126 | | 192.168.22.1 | http://sina.com/a | 2020-11-27 15:31:23 | 20201127 | 192.168.22.4 | http://sina.com/a | 2020-11-26 16:21:33 | 20201126 | | 192.168.22.4 | http://sina.com/a | 2020-11-27 16:21:33 | 20201127 | | 192.168.33.1 | http://sina.com/a | 2020-11-26 17:11:43 | 20201126 | | 192.168.33.1 | http://sina.com/a | 2020-11-27 17:11:43 | 20201127 | | 192.168.12.6 | http://sina.com/v | 2020-11-26 15:31:33 | 20201126 | | 192.168.12.6 | http://sina.com/v | 2020-11-27 15:31:33 | 20201127 | | 192.168.22.7 | http://sina.com/f | 2020-11-26 15:31:36 | 20201126 | | 192.168.22.7 | http://sina.com/f | 2020-11-27 15:31:36 | 20201127 | | 192.168.22.1 | http://sina.com/u | 2020-11-26 15:31:33 | 20201126 | | 192.168.22.1 | http://sina.com/u | 2020-11-27 15:31:33 | 20201127 | +---------------+--------------------+-----------------------+---------------+--+ select ip,url,max(access_time) as time --查询的都是分组字段和聚合函数 from t_pv_log group by ip,url; +---------------+--------------------+----------------------+--+ | ip | url | time | +---------------+--------------------+----------------------+--+ | 192.168.12.6 | http://sina.com/v | 2020-11-27 15:31:33 | | 192.168.22.1 | http://sina.com/a | 2020-11-27 15:31:23 | | 192.168.22.1 | http://sina.com/u | 2020-11-27 15:31:33 | | 192.168.22.4 | http://sina.com/a | 2020-11-27 16:21:33 | | 192.168.22.7 | http://sina.com/f | 2020-11-27 15:31:36 | | 192.168.33.1 | http://sina.com/a | 2020-11-27 17:11:43 | +---------------+--------------------+----------------------+--+ --注:结果有多少行就被分了多少组
分组聚合综合示例
--分组聚合综合示例 --1.建表(分区表) create table t_access(ip string,url string,access_time date) partitioned by (dt string) row format delimited fields terminated by ','; --2.编辑分区文件 vi access.log.20201211 vi access.log.20201212 vi access.log.20201213 --3.导入数据 load data local inpath '/root/hivetest/access.log.20201211' into table t_access partition(dt='20201211'); load data local inpath '/root/hivetest/access.log.20201212' into table t_access partition(dt='20201212'); load data local inpath '/root/hivetest/access.log.20201213' into table t_access partition(dt='20201213'); --4.查询 --求12月11日以后每天http://www.edu360.cn/job总访问次数及访问者中ip地址最大的,显示url --方法一: select dt,'http://www.edu360.cn/job',count(1),max(ip) from t_access where url='http://www.edu360.cn/job' group by dt having dt>'20201211'; --方法二: select dt,max(url),count(1),max(ip) from t_access where url='http://www.edu360.cn/job' group by dt having dt>'20201211'; --方法三: select dt,url,count(1),max(ip) from t_access where url='http://www.edu360.cn/job' group by dt,url having dt>'20201211'; +-----------+---------------------------+------+----------------+--+ | dt | url | _c2 | _c3 | +-----------+---------------------------+------+----------------+--+ | 20201212 | http://www.edu360.cn/job | 2 | 192.168.33.55 | | 20201213 | http://www.edu360.cn/job | 3 | 192.168.33.55 | +-----------+---------------------------+------+----------------+--+ -- 求12月11号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的 select dt,url,count(1),max(ip) from t_access where dt>'20201211' group by dt,url; +-----------+---------------------------------+------+----------------+--+ | dt | url | _c2 | _c3 | +-----------+---------------------------------+------+----------------+--+ | 20201212 | http://www.edu360.cn/job | 2 | 192.168.33.55 | | 20201212 | http://www.edu360.cn/stu | 2 | 192.168.33.44 | | 20201212 | http://www.edu360.cn/teach | 1 | 192.168.44.3 | | 20201213 | http://www.edu360.cn/excersize | 3 | 192.168.33.46 | | 20201213 | http://www.edu360.cn/job | 3 | 192.168.33.55 | | 20201213 | http://www.edu360.cn/pay | 1 | 192.168.34.44 | | 20201213 | http://www.edu360.cn/register | 2 | 192.168.133.3 | +-----------+---------------------------------+------+----------------+--+
hive 子查询
--hive 子查询 -- 求12月11号以后,每天每个页面的总访问次数,及访问者中ip地址中最大的,且只查询出总访问次数>2 的记录 -- 方式1: select dt,url,count(1) as cnts,max(ip) from t_access where dt>'20201211' group by dt,url having cnts>'2'; -- 方式2:用子查询 select dt,url,cnts,ip from (select dt,url,count(1) as cnts,max(ip) as ip from t_access where dt>'20201211' group by dt,url) tmp where cnts>'2'; +-----------+---------------------------------+-------+----------------+--+ | dt | url | cnts | ip | +-----------+---------------------------------+-------+----------------+--+ | 20201213 | http://www.edu360.cn/excersize | 3 | 192.168.33.46 | | 20201213 | http://www.edu360.cn/job | 3 | 192.168.33.55 | +-----------+---------------------------------+-------+----------------+--+
hive 数据类型
数字类型、日期时间类型、字符串类型、混杂类型、复合类型(hive独有)
1.array数组类型
--1.array数组类型 --建表 create table t_movie(movie_name string,actors array<string>,first_show date) row format delimited fields terminated by ',' collection items terminated by ':'; --vi movie.data 无名之辈,章宇:陈建斌:任素汐,2018-11-16 我不是药神,徐峥:王传君:章宇,2018-07-05 一出好戏,黄渤:王宝强:舒淇,2018-08-10 中国机长,欧豪:杜江:袁泉,2018-05-18 囧妈,徐峥:黄梅莹:袁泉,2020-01-25 --导数据 load data local inpath '/root/hivetest/movie.dat' into table t_movie ; --查询 select movie_name,actors[0],first_show from t_movie; +-------------+------+-------------+--+ | movie_name | _c1 | first_show | +-------------+------+-------------+--+ | 无名之辈 | 章宇 | 2018-11-16 | | 我不是药神 | 徐峥 | 2018-07-05 | | 一出好戏 | 黄渤 | 2018-08-10 | | 中国机长 | 欧豪 | 2018-05-18 | | 囧妈 | 徐峥 | 2020-01-25 | +-------------+------+-------------+--+ --查询徐峥参演的电影 select movie_name,actors,first_show from t_movie where array_contains(actors,'徐峥') ; +-------------+--------------------+-------------+--+ | movie_name | actors | first_show | +-------------+--------------------+-------------+--+ | 我不是药神 | ["徐峥","王传君","章宇"] | 2018-07-05 | | 囧妈 | ["徐峥","黄梅莹","袁泉"] | 2020-01-25 | +-------------+--------------------+-------------+--+ --查询电影参演人数 select movie_name ,size(actors) as act_num ,first_show from t_movie; +-------------+----------+-------------+--+ | movie_name | act_num | first_show | +-------------+----------+-------------+--+ | 无名之辈 | 3 | 2018-11-16 | | 我不是药神 | 3 | 2018-07-05 | | 一出好戏 | 3 | 2018-08-10 | | 中国机长 | 3 | 2018-05-18 | | 囧妈 | 3 | 2020-01-25 | +-------------+----------+-------------+--+
2.map类型
--2.map类型 --建表 create table t_family(id int,name string,familys map<string,string>,age int) row format delimited fields terminated by ',' collection items terminated by '#' map keys terminated by ':'; --vi family.dat 1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28 2,lisi,father:mayun#mother:huangyi#brother:guanyu,22 3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29 4,mayun,father:mayongzhen#mother:angelababy,26 --导数据 load data local inpath '/root/hivetest/family.dat' into table t_family; --查出每个人的爸爸 select id,name,familys['father'],age from t_family; +-----+-----------+--------------+------+--+ | id | name | _c2 | age | +-----+-----------+--------------+------+--+ | 1 | zhangsan | xiaoming | 28 | | 2 | lisi | mayun | 22 | | 3 | wangwu | wangjianlin | 29 | | 4 | mayun | mayongzhen | 26 | +-----+-----------+--------------+------+--+ --查询出每个人有哪些亲属关系 select id,name,map_keys(familys) as relations,age from t_family; +-----+-----------+--------------------------------+------+--+ | id | name | relations | age | +-----+-----------+--------------------------------+------+--+ | 1 | zhangsan | ["father","mother","brother"] | 28 | | 2 | lisi | ["father","mother","brother"] | 22 | | 3 | wangwu | ["father","mother","sister"] | 29 | | 4 | mayun | ["father","mother"] | 26 | +-----+-----------+--------------------------------+------+--+ -- 查出每个人的亲人名字 select id,name,map_values(familys) as f_name,age from t_family; +-----+-----------+-------------------------------------+------+--+ | id | name | f_name | age | +-----+-----------+-------------------------------------+------+--+ | 1 | zhangsan | ["xiaoming","xiaohuang","xiaoxu"] | 28 | | 2 | lisi | ["mayun","huangyi","guanyu"] | 22 | | 3 | wangwu | ["wangjianlin","ruhua","jingtian"] | 29 | | 4 | mayun | ["mayongzhen","angelababy"] | 26 | +-----+-----------+-------------------------------------+------+--+ -- 查出每个人的亲人数量 select id,name,size(familys),age from t_family; +-----+-----------+------+------+--+ | id | name | _c2 | age | +-----+-----------+------+------+--+ | 1 | zhangsan | 3 | 28 | | 2 | lisi | 3 | 22 | | 3 | wangwu | 3 | 29 | | 4 | mayun | 2 | 26 | +-----+-----------+------+------+--+ -- 查出所有拥有兄弟的人及他的兄弟是谁 --方法1 select id,name,familys['brother'],age from t_family where array_contains(map_keys(familys),'brother'); --方法2 select id,name,age,familys['brother'] from (select id,name,age,map_keys(familys) as relations,familys from t_family ) tmp where array_contains(relations,'brother'); +-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+ | id | name | age | relations | familys | +-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+ | 1 | zhangsan | 28 | ["father","mother","brother"] | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"} | | 2 | lisi | 22 | ["father","mother","brother"] | {"father":"mayun","mother":"huangyi","brother":"guanyu"} | | 3 | wangwu | 29 | ["father","mother","sister"] | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"} | | 4 | mayun | 26 | ["father","mother"] | {"father":"mayongzhen","mother":"angelababy"} | +-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+ +-----+-----------+------+---------+--+ | id | name | age | _c3 | +-----+-----------+------+---------+--+ | 1 | zhangsan | 28 | xiaoxu | | 2 | lisi | 22 | guanyu | +-----+-----------+------+---------+--+
3.struct类型
--3.struct类型 --建表 create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>) row format delimited fields terminated by ',' collection items terminated by ':'; --vi user.dat 1,zhangsan,18:male:深圳 2,lisi,28:female:北京 3,wangwu,38:male:广州 4,赵六,26:female:上海 5,钱琪,35:male:杭州 6,王八,48:female:南京 --导数据 load data local inpath '/root/hivetest/user.dat' into table t_user; --查询每个人的id,name,和地址 select id,name,info.addr from t_user; +-----+-----------+-------+--+ | id | name | addr | +-----+-----------+-------+--+ | 1 | zhangsan | 深圳 | | 2 | lisi | 北京 | | 3 | wangwu | 广州 | | 4 | 赵六 | 上海 | | 5 | 钱琪 | 杭州 | | 6 | 王八 | 南京 | +-----+-----------+-------+--+
hive 的函数
字符串函数、类型转换函数、数学运算函数、时间函数、表生成函数
--字符串函数 select substr("abcgfik",1,3); --用常量测试函数 select substr("abcgfik",0,3); +------+--+ | _c0 | +------+--+ | abc | +------+--+ --结果一致
表生成函数
--表生成函数(explode() lateral view的使用) --wordcount示例 --建表 create table t_wc(sentence string); --vi word.dat select name string name name select from id info select select id id name info string from id id string name --导数据 load data local inpath '/root/hivetest/word.dat' into table t_wc; --查询 select explode(split(sentence,' ')) as word --按空格切割,形成数组,利用行转列函数explode()炸开 from t_wc; +---------+--+ | word | +---------+--+ | select | | name | | string | | name | | name | | select | | from | | id | | info | | select | | select | | id | | id | | name | | info | | string | | from | | id | | id | | string | | name | +---------+--+ select word,count(1) as conts from ( select explode(split(sentence,' ')) as word from t_wc ) tmp group by word; +---------+--------+--+ | word | conts | +---------+--------+--+ | from | 2 | | id | 5 | | info | 2 | | name | 5 | | select | 4 | | string | 3 | +---------+--------+--+
lateral view
--lateral view --建表 create table t_subject(id int,name string,subjects array<string>) row format delimited fields terminated by ' ' collection items terminated by ','; --vi subjects.txt 001 zhangsan 物理,化学,数学,英语,生物 002 lisi 生物,历史,数学,语文,化学 003 wangwu 体育,美术,化学,语文,英语,数学 --导数据 load data local inpath '/root/hivetest/subject.txt' into table t_subject; --查询有哪些课程 select distinct tmp.sub from( select explode(subjects) as sub from t_subject ) tmp; +----------+--+ | tmp.sub | +----------+--+ | 体育 | | 化学 | | 历史 | | 数学 | | 物理 | | 生物 | | 美术 | | 英语 | | 语文 | +----------+--+ --lateral view 示例(lateral view将explode函数的结果生成新表) select id,name,tmp.sub from t_subject lateral view explode(subjects) tmp as sub ; +-----+-----------+----------+--+ | id | name | tmp.sub | +-----+-----------+----------+--+ | 1 | zhangsan | 物理 | | 1 | zhangsan | 化学 | | 1 | zhangsan | 数学 | | 1 | zhangsan | 英语 | | 1 | zhangsan | 生物 | | 2 | lisi | 生物 | | 2 | lisi | 历史 | | 2 | lisi | 数学 | | 2 | lisi | 语文 | | 2 | lisi | 化学 | | 3 | wangwu | 体育 | | 3 | wangwu | 美术 | | 3 | wangwu | 化学 | | 3 | wangwu | 语文 | | 3 | wangwu | 英语 | | 3 | wangwu | 数学 | +-----+-----------+----------+--+