01-Clickhouse-阿里云开发者社区

一.介绍

1.clickhouse 特点

查询速度快
支持 SQL
采用 MPP 架构
列式存储
向量化执行引擎

消除循环来提高效率，比如需要循环 3 次才能完成的工作，转化为 3 个工作并行处理。

适用于 OLAP 场景
相同场景的不同数据量使用不同算法, 比如

常量字符串查询，使用 volnitsky 算法。
非常量字符串，使用 CPU 的向量化 SIMD 指令。
字符串正则匹配，使用 re2 和 hyperscan 算法。

2.应用场景

大多数是读请求
数据总是以相当大的批(> 1000 rows)进行写入
不修改已添加的数据
每次查询都从数据库中读取大量的行，但是同时又仅需要少量的列
宽表，即每个表包含着大量的列
较少的查询(通常每台服务器每秒数百个查询或更少)
对于简单查询，允许延迟大约 50 毫秒
列中的数据相对较小：数字和短字符串(例如，每个 URL 60 个字节)
处理单个查询时需要高吞吐量（每个服务器每秒高达数十亿行）
事务不是必须的
对数据一致性要求低
每一个查询除了一个大表外都很小
查询结果明显小于源数据，换句话说，数据被过滤或聚合后能够被盛放在单台服务器的内存中

3.使用的存储引擎?

MergeTree：基于时间排序
ReplacingMergeTree
SummingMergeTree
AggregatingMergeTree
CollapsingMergeTree
VersionedCollapsingMergeTree
GraphiteMergeTree

4.使用 clickhouse 注意事项

严格区分大小写,注意库名和字段的大小写
子查询的查询结果需要加 as 别名
不支持 ndv 函数,支持使用 count(distinct column)

作为对比，Impala 中的使用方法(APPX_COUNT_DISTINCT 是 Impala 的查询选项，ClickHouse 中没有该参数)，如果设置如下参数：
set APPX_COUNT_DISTINCT=true;
则所有的 count(distinct col)会在底层计算的时候转成 ndv() 函数，也就是说，在 sql 中可以直接使用 count(distinct col),如果不配置上述参数，则在 sql 中直接写 ndv(col) 也可以

ifnull 函数在 null 时需要有默认值 ifnull(ddopsd.all_sal_act_qty,0)
查询条件字段最好加上''单引号,避免类型不匹配
涉及到计算函数的字段,字段类型必须是 Number 类型,不能是 String
超过 50g 的表不能直接删除,需要添加一个空文件 sudo touch /data/clickhouse/flags/force_drop_table && sudo chmod 666 /data/clickhouse/flags/force_drop_table
创建表的 order by 会影响查询效率(相当于 mysql 的索引)
left join 尽量使用 any left join
存在 null 值的字段不能指定为 order by 索引
clickhouse 的字段是 Int32 时,插入数据不能为 null
null 值不能转化为 Int32 类型,会报错
clickhouse 在 21.3.1 以后的版本支持开窗函数
clickhouse 的字段是 Int32 时,插入数据不能为 null
空值问题

空表，Nullable 与非空类型可以互转；
Nullable 字段，如果记录不带有 Null 值，可以从 Nullable 转成非空类型；
含有 null 值的字段不允许转成非空类型；
Nullable 字段不允许用于 order by；

5.clickhouse 支持的函数

支持的函数

二.系统 SQL

1.大于 50g 不能删除

# Create the force_drop_table flag file, then make it readable and writable by every user (mode 666).
# NOTE(review): presumably the server checks this flag to allow dropping a table above the size limit — confirm against the server documentation.
sudo touch /data/clickhouse/flags/force_drop_table && sudo chmod 666 /data/clickhouse/flags/force_drop_table

2.查询表的数据量

要统计 ClickHouse 中数据量最大的表，并按数据量降序排列，可以执行以下 SQL 语句：

-- Total size in bytes per table, largest first.
-- Only active parts are counted: parts that have already been merged away stay
-- listed in system.parts for a while and would otherwise inflate the totals.
-- Tables that share a name across databases are summed together; add
-- `database` to the SELECT and GROUP BY lists to keep them apart.
SELECT
    table,
    sum(bytes) AS size
FROM system.parts
WHERE active
GROUP BY table
ORDER BY size DESC;

查询表行数和数据量

-- Row count and size (total_bytes divided by 1024^3, rounded to 4 decimals)
-- of every table outside the `system` database, ordered by database and name.
SELECT
    database,
    name,
    total_rows,
    round(total_bytes / 1024 / 1024 / 1024, 4) AS total_memory
FROM system.tables AS t
WHERE t.database != 'system'
ORDER BY
    t.database,
    t.name

3.查看异步删除是否完成

-- The ten most recently created mutations together with their completion flag.
SELECT database, table, command, create_time, is_done
FROM system.mutations
ORDER BY create_time DESC
LIMIT 10

4.查询 ALTER 语句的执行记录(query_log)

-- ALTER statements recorded in the query log, newest first.
SELECT *
FROM system.query_log
WHERE query_kind = 'Alter'
ORDER BY query_start_time DESC

5.查看版本号

-- Returns the version of the ClickHouse server.
SELECT version();

6.查询 ck 下的所有数据库

-- Lists every database known to the server.
SELECT *
FROM system.databases AS d;

7.查询是否开启开窗函数

-- Shows the current value of the window-function switch.
SELECT *
FROM `system`.settings AS s
WHERE name = 'allow_experimental_window_functions'

allow_experimental_window_functions=1 代表开启
allow_experimental_window_functions=0 未开启

8.开窗函数配置

-- Select-list fragment: running total of hot_size_sal_qty (NULL treated as 0) per product_code, accumulated in ascending period_sdate order.
, sum(ifnull(hot_size_sal_qty, 0)) over (partition by product_code order by period_sdate asc rows between unbounded preceding and current row) as total_hot_size_sal_qty

-- Select-list fragment: rank of each row within its period_sdate, highest total360_sal_qty_store_rate first.
,rank() over (partition by period_sdate order by total360_sal_qty_store_rate desc) as day_360_sal_qty_store_rate_rank

三.新增操作

1.创建库

-- Create the database with the default database engine (no-op if it already exists).
CREATE DATABASE IF NOT EXISTS chtest;

2.创建表

-- Two-column MergeTree table with row_id as the sorting key.
CREATE TABLE default.boss
(
    row_id  String,
    user_id Int32
)
ENGINE = MergeTree()
ORDER BY (row_id)
SETTINGS index_granularity = 16384;

设置策略

-- Template: MergeTree table placed on a named storage policy.
-- SETTINGS takes a plain comma-separated list (no parentheses) and comes after
-- the ENGINE / ORDER BY clauses; storage_policy is a MergeTree-family setting,
-- so the engine is spelled out.
-- Replace the column names, data types and sorting key with real ones.
CREATE TABLE your_table_name
(
    column1 data_type,
    column2 data_type,
    columnN data_type
)
ENGINE = MergeTree
ORDER BY column1
SETTINGS
    index_granularity = 8192,
    storage_policy = 'beinsight';

3.select 建表

-- Create a table from a query result (CREATE TABLE ... AS SELECT).
-- The column list is taken from the SELECT; ORDER BY tuple() declares no
-- sorting key. LIMIT 0, 1 copies one row (offset 0, count 1).
-- The original statement lacked the AS keyword and the select list.
CREATE TABLE t_name_8888
ENGINE = MergeTree
ORDER BY tuple()
SETTINGS index_granularity = 8192
AS
SELECT *
FROM dw_1_sad
LIMIT 0, 1;

4.clickhouse 多个 order

-- MergeTree table partitioned by data_dt with a multi-column sorting key.
-- Every column named in ORDER BY has to be declared in the table, so the six
-- key columns are declared explicitly.
-- NOTE(review): the key columns are assumed to be String — adjust to the real types.
CREATE TABLE bi.boss_info2
(
    row_id                 String,
    user_id                Int32,
    industry               String,
    l1_name                String,
    l2_name                String,
    l3_name                String,
    job_city               String,
    job_area               String,
    offline_props_time     String,
    offline_vip_distribute String,
    offline_vip_time       String,
    pay_now                String,
    data_dt                Date
)
ENGINE = MergeTree()
PARTITION BY data_dt
ORDER BY (industry, l1_name, l2_name, l3_name, job_city, job_area)
SETTINGS index_granularity = 16384;

5.创建物化视图

-- Materialized view backed by its own MergeTree storage, sorted by period_sdate.
-- POPULATE fills the view from the data already present in the source table
-- when the view is created.
-- The defining query must be introduced with AS (missing in the original).
CREATE MATERIALIZED VIEW views.o6
ENGINE = MergeTree
ORDER BY period_sdate
POPULATE
AS
select xxxx  -- placeholder for the real defining query

6.新增列

-- Add the nullable string column product_year_name.
ALTER TABLE `default`.belle_out
    ADD COLUMN product_year_name Nullable(String);

-- Add the nullable string column season_name.
ALTER TABLE `default`.belle_out
    ADD COLUMN season_name Nullable(String);

7.插入语句

-- Insert the three sample rows as one batch instead of three single-row
-- statements: in MergeTree-family tables every INSERT statement produces a new
-- data part, so multi-row inserts are the recommended form.
INSERT INTO default.sales_w (`suppkey`, `brand`, `AA`, `AB`, `AC`, `AD`)
VALUES
    (1, 'nike', 99, 98, 97, 96),
    (2, 'nike', 99, 98, 97, 96),
    (3, 'nike', 99, 98, 97, 96);

8.通过查询新增

-- Load one month of rows into the temp table, resolving PRODUCT_CODE through
-- the product dimension table.
-- The month is selected with a half-open range [first day, first day of next
-- month): the original upper bound '$year-$month-31' is not a real date for
-- months shorter than 31 days.
-- $year / $month are template placeholders, presumably substituted by the
-- calling script before execution.
-- NOTE(review): assumes period_sdate is a Date column — confirm.
INSERT INTO default.tmp_dws_day_org_pro_size_inv_ds
SELECT
    t1.period_sdate AS period_sdate,
    t3.PRODUCT_CODE AS PRODUCT_CODE,
    t1.size_code    AS size_code,
    t1.store_key    AS store_key
FROM default.dws_day_org_pro_size_inv_ds AS t1
ANY LEFT JOIN default.dim_org_allinfo AS t2
    ON (t1.store_key = t2.organ_key)
ANY LEFT JOIN default.dim_pro_allinfo AS t3
    ON (t1.product_key = t3.PRODUCT_KEY)
WHERE t1.period_sdate >= toDate('$year-$month-01')
  AND t1.period_sdate < addMonths(toDate('$year-$month-01'), 1);

9.create table

1.普通建表

-- Local MergeTree table created on cluster_demo via ON CLUSTER, sorted by package_name.
CREATE TABLE dis_j.D_F1_shard ON CLUSTER cluster_demo
(
    `product_code` String,
    `package_name` String
)
ENGINE = MergeTree
ORDER BY package_name
SETTINGS index_granularity = 8192

2.分布表

-- Distributed table over dis_j.D_F1_shard: it takes its structure from the
-- shard table and uses rand() as the sharding expression.
CREATE TABLE dis_j.D_F1_all ON CLUSTER cluster_demo
AS dis_j.D_F1_shard
ENGINE = Distributed('cluster_demo', 'dis_j', D_F1_shard, rand())

3.复制表

复制已有的一个表创建表。如果不指定 engine，默认会复制源表 engine。

-- Create tmp1 with the same structure as D_F1_shard; no ENGINE is given here.
CREATE TABLE dis_j.tmp1
AS dis_j.D_F1_shard

4.集群上复制表

复制已有的一个表创建表。在集群上执行，要把 on cluster 写在 as 前面。

-- Same structural copy, executed on the cluster; ON CLUSTER is written before AS.
CREATE TABLE dis_j.tmp1 ON CLUSTER cluster_demo
AS dis_j.D_F1_shard

5.select 创建表

使用 select 查询结果来创建一个表，需要指定 engine。字段列表会使用查询结果的字段列表。

-- Create a table from a query result: the engine is given explicitly and the
-- column list comes from the SELECT.
CREATE TABLE dis_j.tmp1
ENGINE = MergeTree
ORDER BY package_name
AS
SELECT *
FROM dis_j.D_F1_shard

6.分区表上再分区

最后，在分区表之上再创建分区表可以吗？

-- Create the local (shard) table in ClickHouse.
-- (The original comment used an en dash, which is not a SQL comment marker.)
CREATE TABLE dis_j.t_area_shard ON CLUSTER cluster_demo
(
    area_id   String,
    area_name String
)
ENGINE = MergeTree
ORDER BY area_id
SETTINGS index_granularity = 8192

-- Distributed table on top of the shard table.
-- All statements use the dis_j database, where t_area_shard was created; the
-- original mixed dis_j and dis_jiakai.
CREATE TABLE dis_j.t_area_all ON CLUSTER cluster_demo AS dis_j.t_area_shard
ENGINE = Distributed('cluster_demo', 'dis_j', t_area_shard, rand())

-- A second Distributed table layered on the first Distributed table.
CREATE TABLE dis_j.t_area_all2 ON CLUSTER cluster_demo AS dis_j.t_area_all
ENGINE = Distributed('cluster_demo', 'dis_j', t_area_all, rand())

执行成功！

试着查询一下：表可建，但不可用！

SELECT * FROM dis_j.t_area_all2

SQL 错误 [48]: ClickHouse exception, code: 48, host: 10.9.20.231, port: 8123; Code: 48, e.displayText() = DB::Exception: Distributed on Distributed is not supported (version 19.9.2.4 (official build))

四.查询 SQL

1.查询年月

-- Distinct (year, month) pairs of period_sdate, ordered by year and then month.
SELECT DISTINCT
    year(period_sdate)  AS years,
    month(period_sdate) AS months
FROM tmp_dws_day_org_pro_size_inv_ds_r1
ORDER BY
    years,
    months;

-- Minute-level queries

-- Current date and time formatted to minute precision (YYYY-MM-DD HH:MM).
-- %F expands to the date as YYYY-MM-DD and %R to the 24-hour time as HH:MM.
-- The original pattern '%Y-%M-%d' put %M in the month position, but %M is the
-- minute (or the month name on newer servers), never the month number (%m).
SELECT formatDateTime(now(), '%F %R');

-- Current time truncated to the start of its minute.
SELECT toStartOfMinute(now()) AS event_time;

2.计数 sql

-- Number of rows whose period_sdate falls between 2019-03-01 and 2019-03-31, both bounds inclusive.
SELECT COUNT(1)
FROM default.tmp_dws_day_org_pro_size_inv_ds_r1
WHERE period_sdate BETWEEN '2019-03-01' AND '2019-03-31';

3.排序函数

-- Ranking without window functions.
-- Inner query: rows are sorted by total_SAL_QTY descending and LIMIT 1 BY keeps
-- the first row for each PERIOD_SDATE.
-- Outer query: rowNumberInAllBlocks() is zero-based, so + 1 makes the rank
-- start at 1.
-- The original inner FROM clause named no table although the alias o2 is
-- used; `your_table_name` is a placeholder for the real source table.
SELECT
    rowNumberInAllBlocks() + 1 AS total_SAL_rank
FROM
(
    SELECT
        o2.PERIOD_SDATE,
        o2.total_SAL_QTY
    FROM your_table_name AS o2
    ORDER BY o2.total_SAL_QTY DESC
    LIMIT 1 BY o2.PERIOD_SDATE
);

4.日期处理

将 int 类型转为 date 类型

-- Expression fragment: turns the integer literal into a string and parses it as a date/time, aliased as wet.
-- NOTE(review): the digits appear to be in YYYYMMDDhhmmss order — confirm for real inputs.
parseDateTimeBestEffort(toString(20191201000407)) as wet

5.clickhouse 中的 join

ClickHouse JOIN 查询的执行过程如下：

从 right_table 读取该表全量数据，在内存中构建 HASH MAP；
从 left_table 分批读取数据，根据 JOIN KEY 到 HASH MAP 中进行查找，如果命中，则该数据作为 JOIN 的输出；

从这个实现中可以看出，如果 right_table 的数据量超过单机可用内存空间的限制，则 JOIN 操作无法完成。通常，两表 JOIN 时，将较小表作为 right_table.

1.创建表

-- Left-hand table for the JOIN example, sorted by (product_key, size_code).
-- The three quantity/rate columns are Nullable.
CREATE TABLE default.tmp_pro_size_contribution_rate
(
    `product_key`        String,
    `size_code`          String,
    `sal_rank`           UInt64,
    `total_sal_qty`      Nullable(Int64),
    `total_sal_size_qty` Nullable(Int64),
    `contribution_rate`  Nullable(Float64)
)
ENGINE = MergeTree
ORDER BY (product_key, size_code)
SETTINGS index_granularity = 8192;

-- Second table for the JOIN example, sorted by the same (product_key, size_code) key.
CREATE TABLE default.tmp_pro_size_t12
(
    `product_key` String,
    `size_code`   String,
    `name`        String
)
ENGINE = MergeTree
ORDER BY (product_key, size_code)
SETTINGS index_granularity = 8192;

2.插入数据

-- Insert the eight sample rows (size codes 215 through 250 for product '1') as
-- one batch: the target is a MergeTree table, where every INSERT statement
-- produces a new data part, so one multi-row INSERT replaces eight single-row ones.
INSERT INTO default.tmp_pro_size_contribution_rate
    (`product_key`, `size_code`, `sal_rank`, `total_sal_qty`, `total_sal_size_qty`, `contribution_rate`)
VALUES
    ('1', '215', 99, 98, 97, 50),
    ('1', '220', 99, 98, 97, 50),
    ('1', '225', 99, 98, 97, 50),
    ('1', '230', 99, 98, 97, 50),
    ('1', '235', 99, 98, 97, 50),
    ('1', '240', 99, 98, 97, 50),
    ('1', '245', 99, 98, 97, 50),
    ('1', '250', 99, 98, 97, 50);

INSERT INTO default.tmp_pro_size_t12