hive
hive数据类型
●整数类：tinyint smallint int bigint
●浮点型：float double decimal
●字符串：varchar char string
●布尔型：boolean
●二进制：binary
●复合型：array struct map
●时间型：date timestamp
hive建表语句
-- Hive CREATE TABLE syntax template. Items in [brackets] are optional;
-- table_name, colN_type, ptN_type and num_buckets are placeholders.
create [external] table if not exists table_name -- external: external table; omit it for an internal (managed) table
(
col1_name col1_type comment 'col1',
col2_name col2_type comment 'col2'
)
comment '表注释'
partitioned by (pt1 pt1_type, pt2 pt2_type, ...) -- table partitions
clustered by (col1_name) sorted by (col1_name) into num_buckets buckets -- table buckets; needs: set hive.enforce.bucketing=true
row format delimited fields terminated by '\t' -- field delimiter (tab)
collection items terminated by '-' -- delimiter between array/map/struct elements
map keys terminated by ':' -- delimiter between a map key and its value
stored as textfile/orc -- file storage format (pick one)
location '' -- file location, used for external tables
tblproperties('skip.header.line.count'='1'); -- skip the first line when the data file carries a header row

建表实操
-- Demo table covering the three complex types. Complex types must declare
-- their element types; the struct fields follow the sample rows
-- (province-city-street), the map holds name:relation pairs and the array
-- holds a list of hobbies.
create table test(
    id int,
    name string,
    age int,
    address struct<province:string, city:string, street:string>,
    family map<string, string>,
    interest array<string>
)
row format delimited fields terminated by ','
collection items terminated by '-'
map keys terminated by ':'
stored as textfile;

制作数据
vi /opt/software/files/test.txt
1,tom,11,shandong-jinan-sankongqiao,chenqiong:wife-pang:mother-li:father,basketball-sleep-games
2,tom,11,shandong-jinan-sankongqiao,chenqiong:wife-pang:mother-li:father,basketball-sleep-games
3,tom,11,shandong-jinan-sankongqiao,chenqiong:wife-pang:mother-li:father,basketball-sleep-games
4,tom,11,shandong-jinan-sankongqiao,chenqiong:wife-pang:mother-li:father,basketball-sleep-games
5,tom,11,shandong-jinan-sankongqiao,chenqiong:wife-pang:mother-li:father,basketball-sleep-games

将数据加载到表中
-- Load the sample file from the local file system (local keyword) into table test.
load data local inpath '/opt/software/files/test.txt' into table test;

查询数据
-- Read one element from each complex column: a struct field,
-- a map value looked up by key, and the array item at index 1.
select
    id,
    name,
    age,
    address.province,
    family['chenqiong'],
    interest[1]
from test;

struct通过列名添加.的形式访问 address.province

map通过key访问 family['chenqiong']

array通过位置访问 array[0]

hive分区表
●创建分区表

创建内表
-- Internal table partitioned by the string column dt; rows are stored as
-- comma-delimited text.
create table user_info (
    id   int,
    name string,
    age  int
)
partitioned by (dt string)
row format delimited
    fields terminated by ','
stored as textfile;

●数据导入分区表load
本地load为复制； hdfs上load方法为移动操作
内表
数据集如下：user_info.txt
1,tom,11
2,jerry,12
3,black,13
4,james,14
5,booker,16

内表方案1：
新建分区文件夹
hdfs dfs -mkdir /user/hive/warehouse/test.db/user_info/dt=20240317
# 如果为外部表的话，目录直接用 user_info/20240317 即可，不用添加 dt=20240317
上传文件到hdfs
hdfs dfs -put /opt/software/files/user_info.txt /user/hive/warehouse/test.db/user_info/dt=20240317/
修复分区---专门针对内表
msck repair table user_info;
内表方案2：
新建分区文件夹
hdfs dfs -mkdir /user/hive/warehouse/test.db/user_info/dt=20240318
# 如果为外部表的话，目录直接用 user_info/20240318 即可，不用添加 dt=20240318
上传文件到hdfs
hdfs dfs -put /opt/software/files/user_info.txt /user/hive/warehouse/test.db/user_info/dt=20240318/
alter方式增加分区
-- Register the dt=20240318 directory as a partition of user_info.
-- dt is a string partition column, so the value is quoted, and the
-- statement is terminated so it can be followed by another one.
alter table user_info add partition (dt='20240318')
location '/user/hive/warehouse/test.db/user_info/dt=20240318';

外表
创建外表
-- External counterpart of user_info: same columns and dt partition, with the
-- data directory given explicitly by the location clause.
create external table user_info_external (
    id   int,
    name string,
    age  int
)
partitioned by (dt string)
row format delimited
    fields terminated by ','
stored as textfile
location '/user/test/user_info_external';

数据集如下：user_info.txt
1,tom,11
2,jerry,12
3,black,13
4,james,14
5,booker,16

外表方案
新建分区文件夹
hdfs dfs -mkdir /user/test/user_info_external/20240317
# 外部表：目录名无需 dt= 前缀，分区位置由下面的 alter 语句指定
上传文件到hdfs
hdfs dfs -put /opt/software/files/user_info.txt /user/test/user_info_external/20240317
alter方式增加分区
-- Map partition dt='20240317' of the external table onto its data directory.
-- dt is a string partition column, so the value is quoted, and the
-- statement is terminated so it can be followed by another one.
alter table user_info_external add partition (dt='20240317')
location '/user/test/user_info_external/20240317';

insert数据插入

自动分区

-- Session settings for dynamic partitioning. nonstrict mode lets every
-- partition column be dynamic, as in an insert with partition(dt) and no fixed value.
-- Each set statement is terminated so it can be pasted together with later statements.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

参数优先级：配置文件 < 命令行参数 < 参数声明。此处在 DB 中执行的 set 语句属于参数声明。

全局配置：

-- Dynamic partitioning: partition(dt) carries no value, so the trailing
-- select column (dt) supplies the partition for each row.
insert into table user_info partition (dt)
select
    id,
    name,
    age,
    '20240319' as dt
from user_info
where dt = '20240318';

手动分区

-- Static partitioning: the target partition is fixed in the partition
-- clause, so dt is not part of the select list.
insert into table user_info partition (dt = '20240319')
select
    id,
    name,
    age
from user_info
where dt = '20240318';

hive分桶表
●创建分桶表
-- Turn on the bucketing setting, then create a table clustered on id into
-- 2 buckets with rows sorted by id inside each bucket.
set hive.enforce.bucketing=true;
create table user_info_bucket (
    id   int,
    name string,
    age  int
)
clustered by (id)
sorted by (id)
into 2 buckets
row format delimited
    fields terminated by ','
stored as textfile;

●数据导入分桶表

从一个非分桶表中查询数据并导入分桶表
insert into/overwrite table user_info_bucket
select id,name,age from user_info where dt='20240318' cluster by (id);
注意两个表表字段类型一致。

DDL语法
●库相关
1.show databases;
2.desc database extended huawei;
●表相关
1.show tables in database_name;如：show tables in huawei;
2.desc extended userinfo;
3.alter table user_info rename to user_info_new;
4.show functions;
●分区相关
1.alter table user_info add partition(dt='20240318') location'';
2.alter table user_info drop partition(dt='20240318');
3.alter table user_info partition(dt='20240318') rename to partition(dt='20240313');
●列相关
1.alter table user_info add|replace columns(country string); 新增列/替换列
2.alter table user_info change name name_1 string; 修改列名
3.alter table user_info change name name_1 string after age;
4.alter table user_info change name name1 string first;
多重插入
-- Multi-insert: a single from clause feeds two insert ... select branches.
-- The table keyword is written out to match the other inserts in these
-- notes, and the statement is terminated with ';'.
from user_info
insert into table table_1
    select id, name
insert into table table_2
    select id, age;

导出表
insert overwrite local directory 'path'
select * from table;
数据写入文件系统时会进行文本序列化；默认列分隔符为 ^A（\001），换行符为 \n；如果不添加 local，则导出到 hdfs。

注意：此处采用了overwrite,目录千万注意不要有重要数据，不能为根目录等重要目录。
排序相关函数问题*重要
1.order by：默认全局排序，尽量不要使用
2.sort by：如果set mapreduce.job.reduces>1时，按照分区排序，各个分区排序
3.distribute by：将数据发往哪个分区，默认hash算法
4.cluster by：将数据发往分区后，进行排序，如果distribute by col+sort by col中二者列名一致，则这两个作用等同于cluster by
开窗函数
1.rank()over()：排序，如果数据相同，则重复排名，下一个直接跳过
001 小明 99 1
002 小陈 99 1
003 小李 90 3
2.row_number()over()：排序，不考虑数据相同
001 小明 99 1
002 小陈 99 2
003 小李 90 3
3.dense_rank()over()：排序，如果数据相同，则重复排名，下一个继续
001 小明 99 1
002 小陈 99 1
003 小李 90 2
4.sum()over()/min()over()/max()over()/avg()over()
Transform UDF脚本

建表

-- Score table used by the transform example: one row per student and
-- subject, stored as tab-delimited text with newline-terminated rows.
create table grades (
    id      int,
    name    string,
    subject string,
    score   int
)
row format delimited
    fields terminated by '\t'
    lines terminated by '\n'
stored as textfile;

插入数据
-- Sample rows for grades (id, name, subject, score).
-- String values are quoted and every row except the last ends with a comma.
insert into grades values
(0, 'jack', 'math', 100),
(1, 'tom', 'math', 120),
(2, 'james', 'math', 130),
(3, 'jerry', 'math', 110),
(4, 'anny', 'math', 105),
(5, 'jack', 'chinese', 120),
(6, 'tom', 'chinese', 110),
(7, 'anny', 'chinese', 105),
(8, 'jerry', 'chinese', 100),
(9, 'james', 'chinese', 90),
(10, 'curry', 'chinese', 66),
(11, 'jack', 'english', 90),
(12, 'tom', 'english', 90),
(13, 'james', 'english', 99),
(14, 'jerry', 'english', 89),
(15, 'anny', 'english', 89);

编写脚本计算个人平均成绩，总成绩
avg(values),sum(values) group by name

test_transform.py

"""Hive TRANSFORM script: total and average score per student.

Reads tab-separated rows (id, name, subject, score) from stdin and writes one
tab-separated row (name, sum_score, avg_score) per student to stdout.
"""
import sys


def aggregate_scores(lines):
    """Group scores by student name and format one output row per student.

    Args:
        lines: iterable of strings, each a tab-separated
            id/name/subject/score record with an optional trailing newline.

    Returns:
        A list of newline-terminated, tab-separated name/sum/average strings.
        The average is rounded to 2 decimals.
    """
    scores_by_name = {}
    for line in lines:
        record = line.strip('\n')
        if not record:
            # A blank line (e.g. at the end of the input) carries no record.
            continue
        _id, name, _subject, score = record.split('\t')
        scores_by_name.setdefault(name, []).append(int(score))

    rows = []
    for name, scores in scores_by_name.items():
        total = sum(scores)
        # float() keeps the average fractional on Python 2 as well, where
        # int / int would truncate.
        average = round(float(total) / len(scores), 2)
        rows.append(name + '\t' + str(total) + '\t' + str(average) + '\n')
    return rows


def main():
    """Feed stdin through aggregate_scores and write the rows to stdout."""
    for row in aggregate_scores(sys.stdin):
        sys.stdout.write(row)


if __name__ == '__main__':
    main()

添加脚本
add file /opt/software/hive/scripts/test_transform.py;
# 如果是集群，先把脚本上传到集群(hdfs)，再从hdfs添加：
add file hdfs://master:9000/scripts/test_transform.py;
-- Pipe the four grades columns through the Python script and name the
-- three columns it writes back.
select transform (id, name, subject, score)
    using 'python test_transform.py'
    as (name, sum_score, avg_score)
from grades;

问题？
1.注意表的字段分隔符一定要与脚本中split使用的分隔符一致；目前只验证了\t分隔符，同时注意行分隔符；使用|作为分隔符会报错。
2.输出使用sys.stdout.write()，输出全为字符串格式，分隔符同上。

验证方案
cat test.txt|python test_transform.py

hive函数
desc function 函数名称；查看函数使用
数学函数
函数释义
(+ - * /) 加减乘除
round 四舍五入
trunc 截取长度
ceil 向上取整
floor 向下取整
pow 取乘方
pmod 求模
字符函数
函数释义
lower 转小写
upper 转大写
length 字符串长度
concat 拼接
substr 截取子字符串
trim 去前后空格
get_json_object 从json字符串解析数据
转换函数
函数释义
cast 转换字段类型
日期函数
函数释义
year 获取年
month 获取月
day 获取日
to_date 转字符串为日期
current_date 当前日期
current_timestamp 当前时间戳
条件函数
函数释义
case...when... case A when B then C when D then E else F end
聚合函数
函数释义
count
sum
min
max
avg

hive学习笔记

struct通过列名添加.的形式访问 address.province

map通过key访问 family['chenqiong']

array通过位置访问 array[0]

自动分区

配置文件 < 命令行参数 < 参数声明此处在DB中属于参数声明

quanjupeizhi :

手动分区

注意两个表表字段类型一致。

建表

test_transform.py

热门文章

最新文章

相关课程

相关电子书

相关实验场景

探索云世界

热门

云计算

大数据

云原生

人工智能

数据库

开发与运维

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

直播

下载

镜像站

技术资料

hive学习笔记

struct通过列名添加.的形式访问 address.province

map通过key访问 family['chenqiong']

array通过位置访问 array[0]

自动分区

配置文件 < 命令行参数 < 参数声明 此处在DB中属于参数声明

quanjupeizhi :

手动分区

注意两个表表字段类型一致。

建表

test_transform.py

热门文章

最新文章

相关课程

相关电子书

相关实验场景

配置文件 < 命令行参数 < 参数声明此处在DB中属于参数声明