创建分区表:
create table dept_partition(
deptno string,
dname string,
loc string
)
partitioned by(udate string comment 'this table is partitioned by day') 注:分区字段名为自定义字段,尽量与数据字段不一样!
row format delimited fields terminated by '\t';
注意:分区字段要注明数据类型,且分区字段不在字段列表中,在Hdfs中,分区字段是以分区字段名=分区字段值的形式作为表目录下的子目录存在的
load data local inpath '/opt/module/datas/20211030.log' into table dept_partition partition(udate='10_30');
load data local inpath '/opt/module/datas/20211031.log' into table dept_partition partition(udate='10_31');
load data local inpath '/opt/module/datas/20211101.log' into table dept_partition partition(udate='11_01');
select * from dept_partition where udate='11_01';
select * from dept_partition where udate='10_30'
union
select * from dept_partition where udate='10_31';
select * from dept_partition where udate='10_30' or udate='10_31';
增加分区:
alter table dept_partition add partition(udate='11_02');
alter table dept_partition add partition(udate='11_03') partition(udate='11_04');
删除分区:
alter table dept_partition drop partition(udate='11_04');
alter table dept_partition drop partition(udate='11_02'),partition(udate='11_03');
注意:增加多个分区时,partition之间没有逗号,删除多个分区时,partition之间有逗号!
查看分区:
show partitions dept_partition;
当所有数据混杂在一个文件中,如何分区?
可以先将所有数据存入临时表,再从临时表中select insert
create table dept_partition2(
name string,
cost int,
yue double
)
partitioned by(udate string);
create table tmp(
utime string,
name string,
cost int,
yue double
)
row format delimited fields terminated by '\t';
load data local inpath '/opt/module/datas/class.txt' into table tmp;
insert into table dept_partition2 partition(udate='2019-02-03') select name,cost,yue from tmp where substring(utime,1,10)='2019-02-03';
动态分区:让程序自动识别分区字段值,将数据分别放入对应的分区中
设置:
set hive.exec.dynamic.partition=true; |
--打开动态分区功能 |
set hive.exec.dynamic.partition.mode=nonstrict; |
--将动态分区设为非严格模式 |
set hive.exec.max.dynamic.partitions=1000; |
--设置最大分区个数 |
严格模式:在所有的分区字段中,至少有一个分区字段为静态分区
非严格模式:所有的分区字段都是动态分区
静态分区:手动指定分区字段值
insert into table dept_partition2 partition(udate) select name,cost,yue,substring(utime,1,10) udate from tmp;
注意:在从临时表中select数据时,分区字段必须按顺序放在最后
二级/多级分区:
create table class_par(
name string,
cost int,
yue double
)
partitioned by(uyear string,umonth string,uday string)
row format delimited fields terminated by '\t';
将一份数据导入到 2021-10-02 分区中
load data local inpath '/opt/module/datas/class.txt' into table class_par partition(uyear='2021',umonth='10',uday='02');
insert into table class_par partition(uyear,umonth,uday) select name,cost,yue,substring(utime,1,4) uyear,month(utime) umonth,day(utime) uday from tmp;
动静结合插入数据:
insert into table class_par partition(uyear='2019',umonth,uday) select name,cost,yue,month(utime) umonth,day(utime) uday from tmp where year(utime) = '2019';
select * from class_par where uyear='2020' and umonth='10' and uday='10';
修复指令:如果刚上传的数据查不到,可以执行此命令。
msck repair table class_par;