hive查询的相关示例-阿里云开发者社区

hive基本查询

--hive基本查询
select * from a;
select max(ip) from a;
select count(*) from a;

hive 条件查询

--hive 条件查询
select *
from t_pv_log
where access_time>'2020-11-26 15:31:33';

关联查询

1.内连接

--1.内连接（笛卡尔积），右边字段依次和左边的连一遍
select t_a.*,t_b.*
from a t_a inner join b t_b;
+---------+-----------+---------+-----------+--+
| t_a.id  | t_a.name  | t_b.id  | t_b.name  |
+---------+-----------+---------+-----------+--+
| 1       | a         | 2       | bb        |
| 1       | a         | 3       | cc        |
| 1       | a         | 7       | yy        |
| 1       | a         | 9       | pp        |
| 2       | b         | 2       | bb        |
| 2       | b         | 3       | cc        |
| 2       | b         | 7       | yy        |
| 2       | b         | 9       | pp        |
| 3       | c         | 2       | bb        |
| 3       | c         | 3       | cc        |
| 3       | c         | 7       | yy        |
| 3       | c         | 9       | pp        |
| 4       | d         | 2       | bb        |
| 4       | d         | 3       | cc        |
| 4       | d         | 7       | yy        |
| 4       | d         | 9       | pp        |
| 7       | y         | 2       | bb        |
| 7       | y         | 3       | cc        |
| 7       | y         | 7       | yy        |
| 7       | y         | 9       | pp        |
| 8       | u         | 2       | bb        |
| 8       | u         | 3       | cc        |
| 8       | u         | 7       | yy        |
| 8       | u         | 9       | pp        |
+---------+-----------+---------+-----------+--+
--指定条件 
select t_a.*,t_b.*
from a t_a inner join b t_b
on t_a.id=t_b.id;
+---------+-----------+---------+-----------+--+
| t_a.id  | t_a.name  | t_b.id  | t_b.name  |
+---------+-----------+---------+-----------+--+
| 2       | b         | 2       | bb        |
| 3       | c         | 3       | cc        |
| 7       | y         | 7       | yy        |
+---------+-----------+---------+-----------+--+

2.左连接

--2.左外连接（左连接），左边字段依次和右边的连一遍
select a.*,b.*
from a left join b
on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
+-------+---------+-------+---------+--+
--加条件：保留左边所有字段，右边没有符合左边字段的就为null

3.右连接

--3.右外连接（右连接），不加条件和内联接查询结果相同
select a.*,b.*
from a right join b
on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 7     | y       | 7     | yy      |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+
--加条件：保留右边所有字段，左边没有符合右边字段的就为null

4.全外连接

--4.全外连接
select a.*,b.*
from a full outer join b
on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | a       | NULL  | NULL    |
| 2     | b       | 2     | bb      |
| 3     | c       | 3     | cc      |
| 4     | d       | NULL  | NULL    |
| 7     | y       | 7     | yy      |
| 8     | u       | NULL  | NULL    |
| NULL  | NULL    | 9     | pp      |
+-------+---------+-------+---------+--+
--加条件：左边右边字段都会保存

5.左半连接（hive独有）

--5.左半连接（hive独有）
select a.*
from a left semi join b
on a.id=b.id;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 2     | b       |
| 3     | c       |
| 7     | y       |
+-------+---------+--+
--查询得到与右表相同字段数据的左表数据（故select子句中不能有右表字段）

group by 分组聚合查询

select upper("abcde");--转大写函数
-- 针对每一行计算，指定字段的每行数据都会变
select ip,upper(url),access_time
from t_pv_log;
--求每条url的访问总次数
+---------------+--------------------+-----------------------+---------------+--+
|  t_pv_log.ip  |    t_pv_log.url    | t_pv_log.access_time  | t_pv_log.day  |
+---------------+--------------------+-----------------------+---------------+--+
| 192.168.22.1  | http://sina.com/a  | 2020-11-26 15:31:23   | 20201126      |
| 192.168.22.4  | http://sina.com/a  | 2020-11-26 16:21:33   | 20201126      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-26 17:11:43   | 20201126      |
| 192.168.22.1  | http://sina.com/a  | 2020-11-27 15:31:23   | 20201127      |
| 192.168.22.4  | http://sina.com/a  | 2020-11-27 16:21:33   | 20201127      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-27 17:11:43   | 20201127      |
| 192.168.12.6  | http://sina.com/v  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.12.6  | http://sina.com/v  | 2020-11-27 15:31:33   | 20201127      |
| 192.168.22.7  | http://sina.com/f  | 2020-11-26 15:31:36   | 20201126      |
| 192.168.22.7  | http://sina.com/f  | 2020-11-27 15:31:36   | 20201127      |
| 192.168.22.1  | http://sina.com/u  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.22.1  | http://sina.com/u  | 2020-11-27 15:31:33   | 20201127      |
+---------------+--------------------+-----------------------+---------------+--+
select url,count(1) as cnts
from t_pv_log
group by url;
+--------------------+-------+--+
|        url         | cnts  |
+--------------------+-------+--+
| http://sina.com/a  | 6     |
| http://sina.com/f  | 2     |
| http://sina.com/u  | 2     |
| http://sina.com/v  | 2     |
+--------------------+-------+--+
--语法：group  by 字段（字段中相同的数据划分为一组）
--注：只能按组查询字段，故select 被分组的那个字段，对分好组的数据进行逐组运算。
--求每个url访问者中ip地址最大的
select url,max(ip)
from t_pv_log
group by url;
--max():分组聚合函数（对一组数据即多行进行运算）
+--------------------+---------------+--+
|        url         |      _c1      |
+--------------------+---------------+--+
| http://sina.com/a  | 192.168.33.1  |
| http://sina.com/f  | 192.168.22.7  |
| http://sina.com/u  | 192.168.22.1  |
| http://sina.com/v  | 192.168.12.6  |
+--------------------+---------------+--+
--求每个用户访问同一页面的所有记录中时间最晚的一条
--表分组的样子
+---------------+--------------------+-----------------------+---------------+--+
|  t_pv_log.ip  |    t_pv_log.url    | t_pv_log.access_time  | t_pv_log.day  |
+---------------+--------------------+-----------------------+---------------+--+
| 192.168.22.1  | http://sina.com/a  | 2020-11-26 15:31:23   | 20201126      |
| 192.168.22.1  | http://sina.com/a  | 2020-11-27 15:31:23   | 20201127      
| 192.168.22.4  | http://sina.com/a  | 2020-11-26 16:21:33   | 20201126      |
| 192.168.22.4  | http://sina.com/a  | 2020-11-27 16:21:33   | 20201127      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-26 17:11:43   | 20201126      |
| 192.168.33.1  | http://sina.com/a  | 2020-11-27 17:11:43   | 20201127      |
| 192.168.12.6  | http://sina.com/v  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.12.6  | http://sina.com/v  | 2020-11-27 15:31:33   | 20201127      |
| 192.168.22.7  | http://sina.com/f  | 2020-11-26 15:31:36   | 20201126      |
| 192.168.22.7  | http://sina.com/f  | 2020-11-27 15:31:36   | 20201127      |
| 192.168.22.1  | http://sina.com/u  | 2020-11-26 15:31:33   | 20201126      |
| 192.168.22.1  | http://sina.com/u  | 2020-11-27 15:31:33   | 20201127      |
+---------------+--------------------+-----------------------+---------------+--+
select ip,url,max(access_time) as time --查询的都是分组字段和聚合函数
from t_pv_log
group by ip,url;
+---------------+--------------------+----------------------+--+
|      ip       |        url         |         time         |
+---------------+--------------------+----------------------+--+
| 192.168.12.6  | http://sina.com/v  | 2020-11-27 15:31:33  |
| 192.168.22.1  | http://sina.com/a  | 2020-11-27 15:31:23  |
| 192.168.22.1  | http://sina.com/u  | 2020-11-27 15:31:33  |
| 192.168.22.4  | http://sina.com/a  | 2020-11-27 16:21:33  |
| 192.168.22.7  | http://sina.com/f  | 2020-11-27 15:31:36  |
| 192.168.33.1  | http://sina.com/a  | 2020-11-27 17:11:43  |
+---------------+--------------------+----------------------+--+
--注：结果有多少行就被分了多少组

分组聚合综合示例

--分组聚合综合示例
--1.建表(分区表）
create table t_access(ip string,url string,access_time date)
partitioned by (dt string)
row format delimited 
fields terminated by ',';
--2.编辑分区文件
vi access.log.20201211
vi access.log.20201212
vi access.log.20201213
--3.导入数据
load data local inpath '/root/hivetest/access.log.20201211' into table t_access partition(dt='20201211');
load data local inpath '/root/hivetest/access.log.20201212' into table t_access partition(dt='20201212');
load data local inpath '/root/hivetest/access.log.20201213' into table t_access partition(dt='20201213');
--4.查询
--求12月11日以后每天http://www.edu360.cn/job总访问次数及访问者中ip地址最大的，显示url
--方法一：
select dt,'http://www.edu360.cn/job',count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'20201211';
--方法二：
select dt,max(url),count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt having dt>'20201211';
--方法三：
select dt,url,count(1),max(ip)
from t_access
where url='http://www.edu360.cn/job'
group by dt,url having dt>'20201211';
+-----------+---------------------------+------+----------------+--+
|    dt     |            url            | _c2  |      _c3       |
+-----------+---------------------------+------+----------------+--+
| 20201212  | http://www.edu360.cn/job  | 2    | 192.168.33.55  |
| 20201213  | http://www.edu360.cn/job  | 3    | 192.168.33.55  |
+-----------+---------------------------+------+----------------+--+
-- 求12月11号以后，每天每个页面的总访问次数，及访问者中ip地址中最大的
select dt,url,count(1),max(ip)
from t_access
where dt>'20201211'
group by dt,url;
+-----------+---------------------------------+------+----------------+--+
|    dt     |               url               | _c2  |      _c3       |
+-----------+---------------------------------+------+----------------+--+
| 20201212  | http://www.edu360.cn/job        | 2    | 192.168.33.55  |
| 20201212  | http://www.edu360.cn/stu        | 2    | 192.168.33.44  |
| 20201212  | http://www.edu360.cn/teach      | 1    | 192.168.44.3   |
| 20201213  | http://www.edu360.cn/excersize  | 3    | 192.168.33.46  |
| 20201213  | http://www.edu360.cn/job        | 3    | 192.168.33.55  |
| 20201213  | http://www.edu360.cn/pay        | 1    | 192.168.34.44  |
| 20201213  | http://www.edu360.cn/register   | 2    | 192.168.133.3  |
+-----------+---------------------------------+------+----------------+--+

hive 子查询

--hive 子查询
-- 求12月11号以后，每天每个页面的总访问次数，及访问者中ip地址中最大的，且只查询出总访问次数>2 的记录
-- 方式1：
select dt,url,count(1) as cnts,max(ip)
from t_access
where dt>'20201211'
group by dt,url having cnts>'2';
-- 方式2：用子查询
select dt,url,cnts,ip
from 
(select dt,url,count(1) as cnts,max(ip) as ip
from t_access
where dt>'20201211'
group by dt,url) tmp
where cnts>'2';
+-----------+---------------------------------+-------+----------------+--+
|    dt     |               url               | cnts  |       ip       |
+-----------+---------------------------------+-------+----------------+--+
| 20201213  | http://www.edu360.cn/excersize  | 3     | 192.168.33.46  |
| 20201213  | http://www.edu360.cn/job        | 3     | 192.168.33.55  |
+-----------+---------------------------------+-------+----------------+--+

hive 数据类型

数字类型、日期时间类型、字符串类型、混杂类型、复合类型（hive独有）

1.array数组类型

--1.array数组类型
--建表
create table t_movie(movie_name string,actors array<string>,first_show date)
row  format  delimited fields terminated by ','
collection items terminated by ':';
--vi movie.data
无名之辈,章宇:陈建斌:任素汐,2018-11-16
我不是药神,徐峥:王传君:章宇,2018-07-05
一出好戏,黄渤:王宝强:舒淇,2018-08-10
中国机长,欧豪:杜江:袁泉,2018-05-18
囧妈,徐峥:黄梅莹:袁泉,2020-01-25
--导数据
load data local inpath '/root/hivetest/movie.dat' into table t_movie ;
--查询
select movie_name,actors[0],first_show from t_movie;
+-------------+------+-------------+--+
| movie_name  | _c1  | first_show  |
+-------------+------+-------------+--+
| 无名之辈        | 章宇   | 2018-11-16  |
| 我不是药神       | 徐峥   | 2018-07-05  |
| 一出好戏        | 黄渤   | 2018-08-10  |
| 中国机长        | 欧豪   | 2018-05-18  |
| 囧妈          | 徐峥   | 2020-01-25  |
+-------------+------+-------------+--+
--查询徐峥参演的电影
select movie_name,actors,first_show
from t_movie
where array_contains(actors,'徐峥') ;
+-------------+--------------------+-------------+--+
| movie_name  |       actors       | first_show  |
+-------------+--------------------+-------------+--+
| 我不是药神       | ["徐峥","王传君","章宇"]  | 2018-07-05  |
| 囧妈          | ["徐峥","黄梅莹","袁泉"]  | 2020-01-25  |
+-------------+--------------------+-------------+--+
--查询电影参演人数
select movie_name
,size(actors) as act_num
,first_show
from t_movie;
+-------------+----------+-------------+--+
| movie_name  | act_num  | first_show  |
+-------------+----------+-------------+--+
| 无名之辈        | 3        | 2018-11-16  |
| 我不是药神       | 3        | 2018-07-05  |
| 一出好戏        | 3        | 2018-08-10  |
| 中国机长        | 3        | 2018-05-18  |
| 囧妈          | 3        | 2020-01-25  |
+-------------+----------+-------------+--+

2.map类型

--2.map类型
--建表
create table t_family(id int,name string,familys map<string,string>,age int)
row format delimited fields terminated by ','
collection items terminated by '#'
map keys terminated by ':';
--vi family.dat
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28
2,lisi,father:mayun#mother:huangyi#brother:guanyu,22
3,wangwu,father:wangjianlin#mother:ruhua#sister:jingtian,29
4,mayun,father:mayongzhen#mother:angelababy,26
--导数据
load data local inpath '/root/hivetest/family.dat' into table t_family;
--查出每个人的爸爸
select id,name,familys['father'],age
from t_family;
+-----+-----------+--------------+------+--+
| id  |   name    |     _c2      | age  |
+-----+-----------+--------------+------+--+
| 1   | zhangsan  | xiaoming     | 28   |
| 2   | lisi      | mayun        | 22   |
| 3   | wangwu    | wangjianlin  | 29   |
| 4   | mayun     | mayongzhen   | 26   |
+-----+-----------+--------------+------+--+
--查询出每个人有哪些亲属关系
select id,name,map_keys(familys) as relations,age
from t_family;
+-----+-----------+--------------------------------+------+--+
| id  |   name    |           relations            | age  |
+-----+-----------+--------------------------------+------+--+
| 1   | zhangsan  | ["father","mother","brother"]  | 28   |
| 2   | lisi      | ["father","mother","brother"]  | 22   |
| 3   | wangwu    | ["father","mother","sister"]   | 29   |
| 4   | mayun     | ["father","mother"]            | 26   |
+-----+-----------+--------------------------------+------+--+
-- 查出每个人的亲人名字
select id,name,map_values(familys) as f_name,age
from t_family;
+-----+-----------+-------------------------------------+------+--+
| id  |   name    |               f_name                | age  |
+-----+-----------+-------------------------------------+------+--+
| 1   | zhangsan  | ["xiaoming","xiaohuang","xiaoxu"]   | 28   |
| 2   | lisi      | ["mayun","huangyi","guanyu"]        | 22   |
| 3   | wangwu    | ["wangjianlin","ruhua","jingtian"]  | 29   |
| 4   | mayun     | ["mayongzhen","angelababy"]         | 26   |
+-----+-----------+-------------------------------------+------+--+
-- 查出每个人的亲人数量
select id,name,size(familys),age
from t_family;
+-----+-----------+------+------+--+
| id  |   name    | _c2  | age  |
+-----+-----------+------+------+--+
| 1   | zhangsan  | 3    | 28   |
| 2   | lisi      | 3    | 22   |
| 3   | wangwu    | 3    | 29   |
| 4   | mayun     | 2    | 26   |
+-----+-----------+------+------+--+
-- 查出所有拥有兄弟的人及他的兄弟是谁
--方法1
select id,name,familys['brother'],age
from t_family
where array_contains(map_keys(familys),'brother');
--方法2
select id,name,age,familys['brother']
from
(select id,name,age,map_keys(familys) as relations,familys
from t_family ) tmp
where array_contains(relations,'brother');
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
| id  |   name    | age  |           relations            |                            familys                             |
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
| 1   | zhangsan  | 28   | ["father","mother","brother"]  | {"father":"xiaoming","mother":"xiaohuang","brother":"xiaoxu"}  |
| 2   | lisi      | 22   | ["father","mother","brother"]  | {"father":"mayun","mother":"huangyi","brother":"guanyu"}       |
| 3   | wangwu    | 29   | ["father","mother","sister"]   | {"father":"wangjianlin","mother":"ruhua","sister":"jingtian"}  |
| 4   | mayun     | 26   | ["father","mother"]            | {"father":"mayongzhen","mother":"angelababy"}                  |
+-----+-----------+------+--------------------------------+----------------------------------------------------------------+--+
+-----+-----------+------+---------+--+
| id  |   name    | age  |   _c3   |
+-----+-----------+------+---------+--+
| 1   | zhangsan  | 28   | xiaoxu  |
| 2   | lisi      | 22   | guanyu  |
+-----+-----------+------+---------+--+

3.struct类型

--3.struct类型
--建表
create table t_user(id int,name string,info struct<age:int,sex:string,addr:string>)
row format delimited fields terminated by ','
collection items terminated by ':';
--vi user.dat
1,zhangsan,18:male:深圳
2,lisi,28:female:北京
3,wangwu,38:male:广州
4,赵六,26:female:上海
5,钱琪,35:male:杭州
6,王八,48:female:南京
--导数据
load data local inpath '/root/hivetest/user.dat' into table t_user;
--查询每个人的id,name,和地址
select id,name,info.addr
from t_user;
+-----+-----------+-------+--+
| id  |   name    | addr  |
+-----+-----------+-------+--+
| 1   | zhangsan  | 深圳    |
| 2   | lisi      | 北京    |
| 3   | wangwu    | 广州    |
| 4   | 赵六        | 上海    |
| 5   | 钱琪        | 杭州    |
| 6   | 王八        | 南京    |
+-----+-----------+-------+--+

hive 的函数

字符串函数、类型转换函数、数学运算函数、时间函数、表生成函数

--字符串函数
select substr("abcgfik",1,3); --用常量测试函数
select substr("abcgfik",0,3);
+------+--+
| _c0  |
+------+--+
| abc  |
+------+--+
--结果一致

表生成函数

--表生成函数（explode()    lateral view的使用）
--wordcount示例
--建表
create table t_wc(sentence string);
--vi word.dat
select name string name
name select from id info
select select id id name info
string from id
id string name
--导数据
load data local inpath '/root/hivetest/word.dat' into table t_wc;
--查询
select  explode(split(sentence,' ')) as word --按空格切割，形成数组，利用行转列函数explode()炸开
from t_wc;
+---------+--+
|  word   |
+---------+--+
| select  |
| name    |
| string  |
| name    |
| name    |
| select  |
| from    |
| id      |
| info    |
| select  |
| select  |
| id      |
| id      |
| name    |
| info    |
| string  |
| from    |
| id      |
| id      |
| string  |
| name    |
+---------+--+
select word,count(1) as conts
from
(
select  explode(split(sentence,' ')) as word
from t_wc
) tmp
group by word;
+---------+--------+--+
|  word   | conts  |
+---------+--------+--+
| from    | 2      |
| id      | 5      |
| info    | 2      |
| name    | 5      |
| select  | 4      |
| string  | 3      |
+---------+--------+--+

lateral view

--lateral view
--建表
create table t_subject(id int,name string,subjects array<string>)
row format delimited fields terminated by ' '
collection items terminated by ',';
--vi subjects.txt
001 zhangsan 物理,化学,数学,英语,生物
002 lisi 生物,历史,数学,语文,化学
003 wangwu 体育,美术,化学,语文,英语,数学
--导数据
load data local inpath '/root/hivetest/subject.txt' into table t_subject;
--查询有哪些课程
select distinct tmp.sub
from(
select explode(subjects) as sub 
from t_subject
) tmp;
+----------+--+
| tmp.sub  |
+----------+--+
| 体育       |
| 化学       |
| 历史       |
| 数学       |
| 物理       |
| 生物       |
| 美术       |
| 英语       |
| 语文       |
+----------+--+
--lateral view 示例(lateral view将explode函数的结果生成新表)
select id,name,tmp.sub
from t_subject
lateral view
explode(subjects) tmp as sub ;
+-----+-----------+----------+--+
| id  |   name    | tmp.sub  |
+-----+-----------+----------+--+
| 1   | zhangsan  | 物理       |
| 1   | zhangsan  | 化学       |
| 1   | zhangsan  | 数学       |
| 1   | zhangsan  | 英语       |
| 1   | zhangsan  | 生物       |
| 2   | lisi      | 生物       |
| 2   | lisi      | 历史       |
| 2   | lisi      | 数学       |
| 2   | lisi      | 语文       |
| 2   | lisi      | 化学       |
| 3   | wangwu    | 体育       |
| 3   | wangwu    | 美术       |
| 3   | wangwu    | 化学       |
| 3   | wangwu    | 语文       |
| 3   | wangwu    | 英语       |
| 3   | wangwu    | 数学       |
+-----+-----------+----------+--+

hive查询的相关示例

hive基本查询

hive 条件查询

关联查询

1.内连接

2.左连接

3.右连接

4.全外连接

5.左半连接（hive独有）

group by 分组聚合查询

分组聚合综合示例

hive 子查询

hive 数据类型

1.array数组类型

2.map类型

3.struct类型

hive 的函数

表生成函数

lateral view

热门文章

最新文章

相关课程

相关电子书

相关实验场景

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

训练营

直播

乘风者计划

下载

镜像站

技术资料

hive查询的相关示例

hive基本查询

hive 条件查询

关联查询

1.内连接

2.左连接

3.右连接

4.全外连接

5.左半连接（hive独有）

group by 分组聚合查询

分组聚合综合示例

hive 子查询

hive 数据类型

1.array数组类型

2.map类型

3.struct类型

hive 的函数

表生成函数

lateral view

热门文章

最新文章

相关课程

相关电子书

相关实验场景