Apache Kylin1.5.2.1之订单案例详细构建流程

简介: 一.Hive订单数据仓库构建 1. 创建事实表并插入数据 DROP TABLE IF EXISTS default.fact_order ; create table default.

一.Hive订单数据仓库构建

1. 创建事实表并插入数据

DROP TABLE IF EXISTS default.fact_order ;
create table default.fact_order (
time_key string,
product_key string,
salesperson_key string,
custom_key string,
quantity_ordered bigint,
order_dollars bigint,
cost_dollars bigint
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

load data local inpath '/root/kylinsample/fact_order.txt' overwrite into table default.fact_order;

##load data local inpath '/root/kylinsample/fact_order.txt'  into table default.fact_order;


fact_order.txt

2016-05-01,pd001,sp001,ct001,100,2000,1000
2016-05-01,pd001,sp002,ct002,100,2000,1000
2016-05-01,pd001,sp003,ct002,100,2000,1000
2016-05-01,pd002,sp002,ct002,100,2000,1000
2016-05-01,pd003,sp003,ct001,100,2000,1000
2016-05-01,pd001,sp003,ct001,100,2000,1000
2016-05-01,pd001,sp002,ct001,100,2000,1000
2016-05-01,pd001,sp003,ct002,100,2000,1000
2016-05-01,pd002,sp001,ct001,100,2000,1000
2016-05-01,pd003,sp001,ct001,100,2000,1000
2016-05-01,pd004,sp001,ct001,50,1000,600
2016-05-02,pd001,sp001,ct001,50,1000,600
2016-05-02,pd001,sp002,ct002,100,2000,1000
2016-05-02,pd001,sp003,ct002,100,2000,1000
2016-05-02,pd002,sp001,ct001,50,1000,600
2016-05-02,pd003,sp001,ct001,50,1000,600
2016-05-02,pd004,sp001,ct001,50,1000,600
2016-05-03,pd001,sp001,ct001,50,1000,600
2016-05-03,pd001,sp002,ct002,100,2000,1000
2016-05-03,pd001,sp003,ct002,100,2000,1000
2016-05-04,pd002,sp001,ct001,700,14000,10000
2016-05-04,pd003,sp001,ct001,700,14000,10000
2016-05-04,pd004,sp001,ct001,100,2000,1000
2016-05-05,pd001,sp001,ct001,100,2000,1000
2016-05-05,pd001,sp002,ct002,700,14000,10000
2016-05-05,pd001,sp003,ct002,700,14000,10000
2016-05-05,pd002,sp001,ct001,100,2000,1000
2016-05-05,pd003,sp001,ct001,100,2000,1000
2016-05-05,pd004,sp001,ct001,100,2000,1000
2016-05-06,pd001,sp001,ct001,100,2000,1000
2016-05-06,pd001,sp002,ct002,100,2000,1000
2016-05-06,pd001,sp003,ct002,100,2000,1000
2016-05-07,pd002,sp001,ct001,100,2000,1000
2016-05-07,pd003,sp001,ct001,100,2000,1000
2016-05-07,pd004,sp001,ct001,50,1000,600
2016-05-07,pd002,sp001,ct001,100,2000,1000
2016-05-07,pd003,sp001,ct001,100,2000,1000
2016-05-07,pd004,sp001,ct001,50,1000,600
2016-05-08,pd001,sp001,ct001,50,1000,600
2016-05-08,pd001,sp002,ct002,100,2000,1000
2016-05-08,pd001,sp003,ct002,100,2000,1000
2016-05-08,pd001,sp001,ct001,50,1000,600
2016-05-08,pd001,sp002,ct002,100,2000,1000
2016-05-08,pd001,sp003,ct002,100,2000,1000
2016-05-08,pd001,sp001,ct001,50,1000,600
2016-05-08,pd001,sp002,ct002,100,2000,1000
2016-05-08,pd001,sp003,ct002,100,2000,1000
2016-05-09,pd002,sp001,ct001,50,1000,600
2016-05-09,pd003,sp001,ct001,50,1000,600
2016-05-09,pd004,sp001,ct001,50,1000,600
2016-05-09,pd001,sp001,ct001,50,1000,600
2016-05-09,pd002,sp001,ct001,50,1000,600
2016-05-09,pd003,sp001,ct001,50,1000,600
2016-05-09,pd004,sp001,ct001,50,1000,600
2016-05-09,pd001,sp001,ct001,50,1000,600
2016-05-09,pd001,sp002,ct002,100,2000,1000
2016-05-09,pd004,sp003,ct002,100,2000,1000
2016-05-09,pd002,sp001,ct001,700,14000,10000
2016-05-09,pd003,sp003,ct001,700,14000,10000
2016-05-09,pd004,sp003,ct001,100,2000,1000
2016-05-10,pd001,sp001,ct001,100,2000,1000
2016-05-10,pd001,sp002,ct002,700,14000,10000
2016-05-10,pd001,sp003,ct002,700,14000,10000
2016-05-10,pd002,sp001,ct001,100,2000,1000
2016-05-11,pd003,sp003,ct001,100,2000,1000
2016-05-11,pd004,sp001,ct001,100,2000,1000
2016-05-12,pd001,sp001,ct001,100,2000,1000
2016-05-12,pd004,sp002,ct002,100,2000,1000
2016-05-12,pd001,sp003,ct002,100,2000,1000
2016-05-12,pd001,sp001,ct001,100,2000,1000
2016-05-12,pd004,sp002,ct002,100,2000,1000
2016-05-12,pd001,sp003,ct002,100,2000,1000
2016-05-13,pd002,sp001,ct001,100,2000,1000
2016-05-13,pd003,sp001,ct001,100,2000,1000
2016-05-13,pd004,sp001,ct001,50,1000,600
2016-05-14,pd001,sp001,ct001,50,1000,600
2016-05-14,pd001,sp002,ct002,100,2000,1000
2016-05-14,pd001,sp003,ct002,100,2000,1000
2016-05-15,pd002,sp001,ct001,50,1000,600
2016-05-15,pd003,sp001,ct001,50,1000,600
2016-05-15,pd004,sp001,ct001,50,1000,600
2016-05-15,pd002,sp001,ct001,50,1000,600
2016-05-15,pd003,sp001,ct001,50,1000,600
2016-05-15,pd004,sp001,ct001,50,1000,600
2016-05-15,pd002,sp001,ct001,50,1000,600
2016-05-15,pd003,sp001,ct001,50,1000,600
2016-05-15,pd004,sp001,ct001,50,1000,600
2016-05-16,pd001,sp001,ct001,50,1000,600
2016-05-16,pd001,sp002,ct002,100,2000,1000
2016-05-16,pd001,sp003,ct002,100,2000,1000
2016-05-16,pd001,sp001,ct001,50,1000,600
2016-05-16,pd001,sp002,ct002,100,2000,1000
2016-05-16,pd001,sp003,ct002,100,2000,1000
2016-05-17,pd002,sp001,ct001,700,14000,10000
2016-05-17,pd003,sp001,ct001,700,14000,10000
2016-05-17,pd004,sp001,ct001,100,2000,1000
2016-05-17,pd002,sp001,ct001,700,14000,10000
2016-05-17,pd003,sp001,ct001,700,14000,10000
2016-05-17,pd004,sp001,ct001,100,2000,1000
2016-05-18,pd001,sp001,ct001,100,2000,1000
2016-05-18,pd003,sp002,ct001,700,14000,10000
2016-05-18,pd001,sp003,ct002,700,14000,10000
2016-05-19,pd002,sp001,ct001,100,2000,1000
2016-05-19,pd003,sp001,ct002,100,2000,1000
2016-05-20,pd001,sp001,ct001,100,2000,1000
2016-05-20,pd002,sp002,ct002,100,2000,1000
2016-05-20,pd003,sp003,ct001,100,2000,1000
2016-05-20,pd004,sp001,ct001,100,2000,1000
2016-05-20,pd001,sp002,ct002,100,2000,1000
2016-05-20,pd002,sp001,ct002,100,2000,1000

 
2. 创建天维度表dim_day

DROP TABLE IF EXISTS default.dim_day ;

create table default.dim_day (
day_key string,
full_day string,
month_name string,
quarter string,
year string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
load data local inpath '/root/kylinsample/dim_day.txt' overwrite into table default.dim_day;

 

 dim_day.txt
 
2016-05-01,2016-05-01,201605,2016q2,2016
2016-05-02,2016-05-02,201605,2016q2,2016
2016-05-03,2016-05-03,201605,2016q2,2016
2016-05-04,2016-05-04,201605,2016q2,2016
2016-05-05,2016-05-05,201605,2016q2,2016
2016-05-06,2016-05-06,201605,2016q2,2016
2016-05-07,2016-05-07,201605,2016q2,2016
2016-05-08,2016-05-08,201605,2016q2,2016
2016-05-09,2016-05-09,201605,2016q2,2016
2016-05-10,2016-05-10,201605,2016q2,2016
2016-05-11,2016-05-11,201605,2016q2,2016
2016-05-12,2016-05-12,201605,2016q2,2016
2016-05-13,2016-05-13,201605,2016q2,2016
2016-05-14,2016-05-14,201605,2016q2,2016
2016-05-15,2016-05-15,201605,2016q2,2016
2016-05-16,2016-05-16,201605,2016q2,2016
2016-05-17,2016-05-17,201605,2016q2,2016
2016-05-18,2016-05-18,201605,2016q2,2016
2016-05-19,2016-05-19,201605,2016q2,2016
2016-05-20,2016-05-20,201605,2016q2,2016

 
 
3. 创建售卖员的维度表salesperson_dim
 
DROP TABLE IF EXISTS default.dim_salesperson ;
 
create table default.dim_salesperson (
salesperson_key string,
salesperson string,
salesperson_id string,
region string,
region_code string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
 
load data local inpath '/root/kylinsample/dim_salesperson.txt' overwrite into table default.dim_salesperson;
  
 dim_salesperson.txt
 
sp001,hongbin,sp001,beijing,10086
sp002,hongming,sp002,beijing,10086
sp003,hongmei,sp003,beijing,10086

 

4. 创建客户维度 custom_dim

 
 DROP TABLE IF EXISTS default.dim_custom ;
 
create table default.dim_custom (
custom_key string,
custom_name string,
custorm_id string,
headquarter_states string,
billing_address string,
billing_city string,
billing_state string,
industry_name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
 
load data local inpath '/root/kylinsample/dim_custom.txt' overwrite into table default.dim_custom;

 dim_custom.txt
 
ct001,custom_john,ct001,beijing,zgx-beijing,beijing,beijing,internet                   
ct002,custom_herry,ct002,henan,shlinjie,shangdang,henan,internet     
 
 
 
 
5. 创建产品维度表并插入数据
 
 DROP TABLE IF EXISTS default.dim_product ;                                             
                                                                                         
create table default.dim_product (                                                     
product_key string,                                                                
product_name string,                                                               
product_id string,                                                                 
product_desc string,                                                               
sku string,                                                                        
brand string,                                                                      
brand_code string,                                                                 
brand_manager string,                                                              
category string,                                                                   
category_code string                                                               
)                                                                                      
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','                                          
STORED AS TEXTFILE;                                                                    
                      
load data local inpath '/root/kylinsample/dim_product.txt' overwrite into table default.dim_product;     
 dim_product.txt
 
pd001,Box-Large,pd001,Box-Large-des,large1.0,brand001,brandcode001,brandmanager001,Packing,cate001
pd002,Box-Medium,pd001,Box-Medium-des,medium1.0,brand001,brandcode001,brandmanager001,Packing,cate001
pd003,Box-small,pd001,Box-small-des,small1.0,brand001,brandcode001,brandmanager001,Packing,cate001
pd004,Evelope,pd001,Evelope_des,large3.0,brand001,brandcode001,brandmanager001,Pens,cate002

 
这样一个星型的结构表在hive中创建完毕, 实际上一个离线的数据仓库已经完成, 它包含一个主题, 即商品订单.



三.Kylin的Project创建与数据同步
1.单击"Manage Project"
2.单击"New Project"
3.输入"Project Name", WareHouse_01
4.Submit


1.选择WareHouse_01,选择"Data Source" tab页
2.单击"Load Hive Table"
3.输入需要同步的表
  "DEFAULT.FACT_ORDER,DEFAULT.DIM_DAY,DEFAULT.DIM_PRODUCT,DEFAULT.DIM_SALESPERSON,DEFAULT.DIM_CUSTOM"
4.Sync

四.Kylin的Model创建
1.选择"Models" tab页,单击"New Model"
2."Model Name"输入,WareHouse_01_Model
3.选择"Fact Table"为 DEFAULT.FACT_ORDER;再 添加Lookup Table;
4.选取每张表的哪些列字段作为Dimensions
 ID Table Name           Columns
 1 DEFAULT.FACT_ORDER  TIME_KEY PRODUCT_KEY SALESPERSON_KEY CUSTOM_KEY
 2 DEFAULT.DIM_DAY          FULL_DAY
 3 DEFAULT.DIM_PRODUCT  PRODUCT_NAME
 4 DEFAULT.DIM_SALESPERSON  SALESPERSON
 5 DEFAULT.DIM_CUSTOM  CUSTOM_NAME

5.选取DEFAULT.FACT_ORDER表的哪些列字段作为measures
        QUANTITY_ORDERED ORDER_DOLLARS COST_DOLLARS

6.a.选取 "Partition Date Column"为DEFAULT.FACT_ORDER.TIME_KEY,格式 yyyy-MM-dd
  b.对于"Filter"条件,由于没有要过滤的条件,故不填写

7.Save


五.Kylin的Cube创建

 

1.选择"Models" tab页,单击"New Cube“

2.Cube Info:
          "Model Name"选择,WareHouse_01_Model
           "Cube Name"输入,cube01

3.Dismensions:
          单击"Auto Generator",依据情况选择维度的列,全选

4.Measures:
          a.单击"+Measure",添加要聚合计算的度量,比如 sum(QUANTITY_ORDERED)
          b.Expression: SUM/MIN/MAX/COUNT/COUNT_DISTINCT/TOP_N/RAW
5.Refresh Setting:
          a.Auto Merge Thresholds,自动合并阈值,7~28 days
   b.Retention Threshold,保留天数,60
   c.Partition Start Date,非常重要,是后面build cube的开始日期

6.Advanced Setting:
        --Aggregation Groups:
   a.Includes: TIME_KEY ,PRODUCT_KEY ,SALESPERSON_KEY , CUSTOM_KEY
   b.Mandatory Dimensions: TIME_KEY
   c.Hierarchy Dimensions: PRODUCT_KEY ,SALESPERSON_KEY ,CUSTOM_KEY
   d.Joint Dimensions: 无
       --Rowkeys:
 TIME_KEY ,PRODUCT_KEY ,SALESPERSON_KEY ,CUSTOM_KEY 4个字段为dict字典编码
 
7.Configuration Overwrites: 无

8.Overview:
          保存cube


五.Cube Build

1.选择 cube01,单击”Action”,选择Build

2.填写End Date,Submit

3.单击”Monitor”,观察Job

4.等待Process100% (Any Errors)

5.返回cube01,查看 cube size 和 Source Records等字段更新

 
六.Hive* kyin 查询对比

点击(此处)折叠或打开

  1. 1.2016-05-01到2016-05-15期间的每天的订单数量,订单金额,订单成本

  2. Hive: 65.816 s
  3. select fact.time_key, sum(fact.quantity_ordered), sum(fact.order_dollars), sum(fact.cost_dollars) from fact_order as fact
  4. where fact.time_key >= "2016-05-01" and fact.time_key <= "2016-05-15"
  5. group by fact.time_key order by fact.time_key;

  6. Kylin: 0.32s-->0.27s 
  7. select fact.time_key, sum(fact.quantity_ordered), sum(fact.order_dollars), sum(fact.cost_dollars) from fact_order as fact
  8. where fact.time_key between '2016-05-01' and '2016-05-15'
  9. group by fact.time_key order by fact.time_key

点击(此处)折叠或打开

  1. 2.2016-05-01到2016-05-15期间的每天的产品的订单量

  2. Hive: 100.336s
  3. select dday.full_day,dsp.product_name, sum(fact.quantity_ordered) from fact_order as fact
  4. inner join dim_day as dday on fact.time_key = dday.day_key
  5. inner join dim_product as dsp on fact.product_key = dsp.product_key
  6. where dday.full_day >= "2016-05-01" and dday.full_day <= "2016-05-15"
  7. group by dday.full_day,dsp.product_name
  8. order by dday.full_day,dsp.product_name;

  9. Kylin:0.93s-->0.39s
  10. select dday.full_day,dsp.product_name, sum(fact.quantity_ordered) from fact_order as fact
  11. inner join dim_day as dday on fact.time_key = dday.day_key
  12. inner join dim_product as dsp on fact.product_key = dsp.product_key
  13. where dday.full_day >= '2016-05-01' and dday.full_day <= '2016-05-15'
  14. group by dday.full_day,dsp.product_name
  15. order by dday.full_day,dsp.product_name





 

. Hive ? Kylin查询对

目录
相关文章
|
2月前
|
消息中间件 数据挖掘 Kafka
Apache Kafka流处理实战:构建实时数据分析应用
【10月更文挑战第24天】在当今这个数据爆炸的时代,能够快速准确地处理实时数据变得尤为重要。无论是金融交易监控、网络行为分析还是物联网设备的数据收集,实时数据处理技术都是不可或缺的一部分。Apache Kafka作为一款高性能的消息队列系统,不仅支持传统的消息传递模式,还提供了强大的流处理能力,能够帮助开发者构建高效、可扩展的实时数据分析应用。
102 5
|
2月前
|
消息中间件 存储 监控
构建高可用性Apache Kafka集群:从理论到实践
【10月更文挑战第24天】随着大数据时代的到来,数据传输与处理的需求日益增长。Apache Kafka作为一个高性能的消息队列服务,因其出色的吞吐量、可扩展性和容错能力而受到广泛欢迎。然而,在构建大规模生产环境下的Kafka集群时,保证其高可用性是至关重要的。本文将从个人实践经验出发,详细介绍如何构建一个高可用性的Kafka集群,包括集群规划、节点配置以及故障恢复机制等方面。
99 4
|
3月前
|
消息中间件 分布式计算 大数据
大数据-166 Apache Kylin Cube 流式构建 整体流程详细记录
大数据-166 Apache Kylin Cube 流式构建 整体流程详细记录
93 5
|
3月前
|
存储 SQL 分布式计算
大数据-162 Apache Kylin 全量增量Cube的构建 Segment 超详细记录 多图
大数据-162 Apache Kylin 全量增量Cube的构建 Segment 超详细记录 多图
72 3
|
2月前
|
存储 数据挖掘 数据处理
巴别时代使用 Apache Paimon 构建 Streaming Lakehouse 的实践
随着数据湖技术的发展,企业纷纷探索其优化潜力。本文分享了巴别时代使用 Apache Paimon 构建 Streaming Lakehouse 的实践。Paimon 支持流式和批处理,提供高性能、统一的数据访问和流批一体的优势。通过示例代码和实践经验,展示了如何高效处理实时数据,解决了数据一致性和故障恢复等挑战。
126 61
|
2月前
|
消息中间件 Java Kafka
Spring Boot 与 Apache Kafka 集成详解:构建高效消息驱动应用
Spring Boot 与 Apache Kafka 集成详解:构建高效消息驱动应用
54 1
|
2月前
|
监控 Cloud Native BI
8+ 典型分析场景,25+ 标杆案例,Apache Doris 和 SelectDB 精选案例集(2024版)电子版上线
飞轮科技正式推出 Apache Doris 和 SelectDB 精选案例集 ——《走向现代化的数据仓库(2024 版)》,汇聚了来自各行各业的成功案例与实践经验。该书以行业为划分标准,辅以使用场景标签,旨在为读者提供一个高度整合、全面涵盖、分类清晰且易于查阅的学习资源库。
|
3月前
|
Java 大数据 数据库连接
大数据-163 Apache Kylin 全量增量Cube的构建 手动触发合并 JDBC 操作 Scala
大数据-163 Apache Kylin 全量增量Cube的构建 手动触发合并 JDBC 操作 Scala
42 2
大数据-163 Apache Kylin 全量增量Cube的构建 手动触发合并 JDBC 操作 Scala
|
3月前
|
SQL 分布式计算 NoSQL
大数据-164 Apache Kylin Cube优化 案例1 定义衍生维度与对比 超详细
大数据-164 Apache Kylin Cube优化 案例1 定义衍生维度与对比 超详细
40 1
大数据-164 Apache Kylin Cube优化 案例1 定义衍生维度与对比 超详细
|
3月前
|
存储 大数据 分布式数据库
大数据-165 Apache Kylin Cube优化 案例 2 定义衍生维度及对比 & 聚合组 & RowKeys
大数据-165 Apache Kylin Cube优化 案例 2 定义衍生维度及对比 & 聚合组 & RowKeys
50 1

推荐镜像

更多