当某张表中存在上亿条或者数据量很大的情况下,每天全量同步数据会消耗大量的资源,为此可以考虑仅仅将当天的增量数据同步到数仓中
首先需要先将历史数据全量同步到数仓b_order_df表中,之后进行增量数据同步到b_order_di表中,剩下的就是每天将同步过来的增量数据与历史全量数据进行合并的工作。
-- 通过创建时间或者更新时间筛选出业务日期中新增或者变化的输入。同步到数仓中的增量表b_order_di中
select
order_code
,external_order_code
,total_order_code
,order_type
,business_type
,order_tag
,channel_code
,terminal
,shop_code
,shop_name
,customer_id
from b_order
where (create_time >= ${bizdate} and create_time < DATE_ADD(${bizdate}, 1))
or (update_time >= ${bizdate} and update_time < DATE_ADD(${bizdate}, 1))
同步入仓完成后需要进行增量数据与全量数据合并工作
-- 将业务日期的增量数据与业务日期前一天的全量数据进行合并成为业务日期下的全量数据,并写入到业务日期的全量表中
insert overwrite table b_order_df paritition(ds = ${bizdate})
select
coalesce(di.order_code,df.order_code) as order_code
,coalesce(di.external_order_code,df.external_order_code) as external_order_code
,coalesce(di.total_order_code,df.total_order_code) as total_order_code
,coalesce(di.order_type,df.order_type) as order_type
,coalesce(di.business_type,df.business_type) as business_type
,coalesce(di.order_tag,df.order_tag) as order_tag
,coalesce(di.channel_code,df.channel_code) as channel_code
,coalesce(di.terminal,df.terminal) as terminal
,coalesce(di.shop_code,df.shop_code) as shop_code
,coalesce(di.shop_name,df.shop_name) as shop_name
,coalesce(di.customer_id,df.customer_id) as customer_id
from(
select
order_code
,external_order_code
,total_order_code
,order_type
,business_type
,order_tag
,channel_code
,terminal
,shop_code
,shop_name
,customer_id
from b_order_di
where ds = ${bizdate} -- 业务日期
)di
full join
(
select
order_code
,external_order_code
,total_order_code
,order_type
,business_type
,order_tag
,channel_code
,terminal
,shop_code
,shop_name
,customer_id
from b_order_df
where DATE_SUB(${bizdate}, 1) -- 业务日期的前一天
)df on di.order_code = df.order_code