拉链表
- 版本迭代:hive 0.14 slowly changing dimension => hive 2.6.0 merge 事务管理
- 原来采用分区表,用户分区存储历史增量数据,缺点是重复数据太多
- 定义:数仓用于解决持续增长且存在一定时间时间范围内重复的数据
- 存储:创建拉链表时使用
列式存储ORC
:
不能使用load加载数据
压缩比高 效率高 场景:【数据规模庞大】,新数据【在有限的时间】内存在多种状态变化
优点:节约空间(一份订单只有一条数据)
举例:
原始表订单: order_id,order_timestamp,user_id,order_status 1,2024_01_21 16:12:37.259,87986321,0 1,2024_01_21 16:12:47.003,87986321,1 1,2024_01_22 09:00:28.022,87986321,2 1,2024_01_24 15:00:00.123,87986321,3 1,2024_02_01 00:30:00.227,87986321,4 order_detail_id,fk_order_id,goods_id,buy_count,goods_price 拉链表订单: order_id,user_id,order_create_timestamp,order_modify_timestamp,order_amount,order_current_status 1,87986321,2024_01_21 16:12:37.259,2024_02_01 00:30:00.227,3242.66,4
配置:
set hive.support.concurrency=true;
set hive.enforce.bucketing=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on=true; -- 表合并开启
set hive.compactor.worker.threads=1; -- 表合并线程必须为一
set hive.auto.convert.join=false; -- 关闭 mapjoin
set hive.merge.cardinality.check=false; -- 关闭检查数据列的基数(列值的差异性)
set mapreduce.job.reduces=4;
- 拉链表实例:
// 创建原始表格
create table yb12211_2.hive_zipper_order(
order_id bigint,
user_id bigint,
order_modify_dt timestamp,
order_money decimal(10,2),
current_status int
)
row format delimited fields terminated by ',';
// 将数据文件导入原始表格
load data local inpath '/root/hive/data/course/order_record.log'
overwrite into table yb12211_2.hive_zipper_order;
// 创建拉链表
// 操作历史全量数据用动态分区
set hive.support.concurrency=true;
set hive.enforce.bucketing=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on=true;
set hive.compactor.worker.threads=1;
set hive.auto.convert.join=false;
set hive.merge.cardinality.check=false;
set mapreduce.job.reduces=4;
drop table if exists yb12211_2.hive_zipper_pc_order;
create table yb12211_2.hive_zipper_pc_order(
order_id bigint,
user_id bigint,
order_create_dt timestamp,
order_modify_dt timestamp,
order_money decimal(10,2),
current_status int
) partitioned by(year int,month int,day int)
clustered by(order_create_dt) into 4 buckets
row format delimited fields terminated by ','
stored as orc
tblproperties("transactional"="true");
// 对拉链表的数据进行聚合,获取订单信息的创建日期、修改日期和订单状态
with zip_src as (
select order_id,user_id,order_money,
min(order_modify_dt) as order_create_dt,
max(order_modify_dt) as order_modify_dt,
max(current_status) as current_status
from yb12211_2.hive_zipper_order
group by order_id,user_id,order_money
)
// 将原始数据灌入拉链表
insert overwrite table yb12211_2.hive_zipper_pc_order partition(year,month,day)
select
order_id,
user_id,
order_create_dt,
order_modify_dt,
order_money,
current_status,
year(order_create_dt) as year,
month(order_create_dt) as month,
day(order_create_dt) as day
from zip_src;
// 拉链表查询 查询之前必须先有这两句配置
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.support.concurrency=true;
select * from yb12211_2.hive_zipper_pc_order
where to_date(order_modify_dt)='2021-02-04'
order by order_modify_dt desc;
// 对于追加增量数据,将增量数据覆盖在原始数据表中
load data local inpath '/root/hive/data/course/order_record_2021_02_05.log'
overwrite into table yb12211_2.hive_zipper_order;
// 将原始数据表中的增量数据插入拉链表
// 利用源数据和目标表的order_id进行匹配,若匹配则更新现有订单信息,若不匹配则插入新订单。
merge into yb12211_2.hive_zipper_pc_order as O
using (
select
order_id,
user_id,
order_create_dt,
order_modify_dt,
order_money,
current_status,
year(order_create_dt) as year,
month(order_create_dt) as month,
day(order_create_dt) as day
from (
select order_id,user_id,order_money,
min(order_modify_dt) as order_create_dt,
max(order_modify_dt) as order_modify_dt,
max(current_status) as current_status
from yb12211_2.hive_zipper_order
group by order_id,user_id,order_money
)T
) as H
on O.order_id=H.order_id
when matched then
update set order_modify_dt=H.order_modify_dt,current_status=H.current_status
when not matched then
insert values(H.order_id,H.user_id,H.order_create_dt,H.order_modify_dt,H.order_money,H.current_status,H.year,H.month,H.day);
// 验证拉链结果:最后修改时间是否大于创建时间
select * from yb12211_2.hive_zipper_pc_order
where to_date(order_modify_dt)>to_date(order_create_dt);
- 验证数据变化的三种情况:
- 新增数据,插入原始表中的所有字段信息。
- 更改数据,更改修改时间|结束时间|数据状态。
- 删除数据:只需将结束日期改为删除当天即可。
构造增量数据
此处提供了订单日增量数据的自动生成代码,读者可利用代码实现对增量数据文件的生成,以便于体验拉链表的作用。
- HiveZipMaker类
package cn.ybg;
import java.io.*;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
public class HiveZipTableMaker {
static Properties pro = new Properties();
static Random rand = new Random();
static Calendar calendar = Calendar.getInstance(Locale.CHINA);
static SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:dd.SSS");
static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
static long orderId = 0;
static long timestamp = 0;
static String order_log = "order/order_log.properties";
static String order_record = "order/order_record_2021_02_05.log";
static String order_list = "order/order_list.log";
static BufferedWriter bw;
static long size;
static LinkedList<Order> orders;
static {
try {
pro.load(new FileReader(order_log));
timestamp = Long.parseLong(pro.getProperty("timestamp"));
calendar.setTimeInMillis(timestamp);
orderId = Long.parseLong(pro.getProperty("orderId"));
size = new File(order_record).length();
bw = new BufferedWriter(new FileWriter(order_record,true));
File orderList = new File(order_list);
if (orderList.exists() && orderList.length()>0){
ObjectInputStream ois = new ObjectInputStream(new FileInputStream(orderList));
orders = (LinkedList<Order>) ois.readObject();
ois.close();
}else{
orders = new LinkedList<>();
}
} catch (Exception e) {
e.printStackTrace();
System.exit(-1);
}
}
static int userId(){
return rand.nextInt(10000000)+1;
}
static long orderId(){
return ++orderId;
}
static Date now(){
calendar.add(Calendar.MILLISECOND,rand.nextInt(5000));
return calendar.getTime();
}
static void write(String line) throws IOException {
if (orderId>1){
bw.newLine();
}
bw.write(line);
}
static void close(){
try {
bw.close();
pro.setProperty("timestamp",String.valueOf(timestamp));
pro.setProperty("orderId",String.valueOf(orderId));
pro.store(new FileWriter(order_log,false),timeFormat.format(new Date()));
if (orders.size()>0){
ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(order_list,false));
oos.writeObject(orders);
oos.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
static double randMoney(){
return new BigDecimal(rand.nextInt(500000)/100+10)
.setScale(2, RoundingMode.HALF_UP)
.doubleValue();
}
static boolean percentage(){
return rand.nextInt(100)<65;
}
public static void main(String[] args) throws IOException, ParseException {
/**
* 订单编号:
* 用户编号:
* 操作日期:
* 订单金额:
* 当前状态:0(suspending),1(paid),2(signed),3(completed),4(return)
*/
final long TWO_WEEKS = 14*24*60*60*1000;
String format = dateFormat.format(now());
Order order;
Date now;
Date dateEnd = dateFormat.parse("2021-02-06");
while ((now = now()).before(dateEnd)){
timestamp = now.getTime();
String nowFormat = dateFormat.format(now);
if (!format.equals(nowFormat)){
format = nowFormat;
if (orders.size()>0) {
Date finalNow = now;
orders.removeIf(o2-> finalNow.getTime()-o2.getModifyDate().getTime()>TWO_WEEKS);
orders.remove(rand.nextInt(orders.size()));
}
}
if (percentage() && orders.size()>=20+rand.nextInt(21)) {
for (int j = 0; j < rand.nextInt(orders.size()/15) ; j++){
int index = rand.nextInt(orders.size());
order = orders.get(index);
if (order.getStatus()<2){
order.setStatus(order.getStatus()+1);
}else{
order.setStatus(percentage()?3:4);
orders.remove(index);
}
order.setModifyDate(now);
write(order.toString());
}
}else{
orderId = orderId();
int userId = userId();
double money = randMoney();
order = new Order(orderId, userId, now, money, 0);
orders.add(order);
write(order.toString());
}
}
close();
}
}
- Order类
package cn.ybg;
import java.io.Serializable;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
* 订单编号:
* 用户编号:
* 操作日期:
* 订单金额:
* 当前状态:0(suspending),1(paid),2(signed),3(completed),4(return)
*/
public class Order implements Serializable {
static SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:dd.SSS");
private long orderId;
private int userId;
private Date modifyDate;
private double money;
private int status;
public Order(long orderId, int userId, Date modifyDate, double money, int status) {
this.orderId = orderId;
this.userId = userId;
this.modifyDate = modifyDate;
this.money = money;
this.status = status;
}
public long getOrderId() {
return orderId;
}
public void setOrderId(long orderId) {
this.orderId = orderId;
}
public int getUserId() {
return userId;
}
public void setUserId(int userId) {
this.userId = userId;
}
public Date getModifyDate() {
return modifyDate;
}
public void setModifyDate(Date modifyDate) {
this.modifyDate = modifyDate;
}
public double getMoney() {
return money;
}
public void setMoney(double money) {
this.money = money;
}
public int getStatus() {
return status;
}
public void setStatus(int status) {
this.status = status;
}
@Override
public String toString() {
return String.format("%d,%d,%s,%.2f,%d",orderId,userId,timeFormat.format(modifyDate),money,status);
}
}