Setting up ClickHouse, the columnar database from Russia, on CentOS 7, with basic performance tests
Confirm that the CPU supports the installation (SSE 4.2 is required)
[root@master ~]# grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"
SSE 4.2 supported
Install the prerequisite packages
[root@master ~]# yum install -y pygpgme yum-utils
Loaded plugins: fastestmirror, langpacks
Loading mirror speeds from cached hostfile
* centos-sclo-rh: mirrors.163.com
base | 3.6 kB 00:00:00
bintray--sbt-rpm | 1.3 kB 00:00:00
centos-sclo-rh | 3.0 kB 00:00:00
extras | 2.9 kB 00:00:00
google-chrome | 1.3 kB 00:00:00
mysql-connectors-community | 2.5 kB 00:00:00
mysql-tools-community | 2.5 kB 00:00:00
mysql57-community | 2.5 kB 00:00:00
sublime-text | 2.9 kB 00:00:00
updates | 2.9 kB 00:00:00
google-chrome/primary | 1.7 kB 00:00:00
google-chrome 3/3
Package pygpgme-0.3-9.el7.x86_64 already installed and latest version
Package yum-utils-1.1.31-52.el7.noarch already installed and latest version
Nothing to do
Create the required yum repository configuration file
[root@master ~]# vim /etc/yum.repos.d/altinity_clickhouse.repo
Paste in the following configuration:
[altinity_clickhouse]
name=altinity_clickhouse
baseurl=https://packagecloud.io/altinity/clickhouse/el/7/$basearch
repo_gpgcheck=1
gpgcheck=0
enabled=1
gpgkey=https://packagecloud.io/altinity/clickhouse/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
metadata_expire=300
[altinity_clickhouse-source]
name=altinity_clickhouse-source
baseurl=https://packagecloud.io/altinity/clickhouse/el/7/SRPMS
repo_gpgcheck=1
gpgcheck=0
enabled=1
gpgkey=https://packagecloud.io/altinity/clickhouse/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
metadata_expire=300
Verify the yum repository configuration and that ClickHouse packages are available for installation
[root@master ~]# yum list 'clickhouse*'
Loaded plugins: fastestmirror, langpacks
Loading mirror speeds from cached hostfile
* centos-sclo-rh: mirror.jdcloud.com
altinity_clickhouse/x86_64/signature | 833 B 00:00:00
Retrieving key from https://packagecloud.io/altinity/clickhouse/gpgkey
Importing GPG key 0x0F6E36F6:
Userid : "https://packagecloud.io/altinity/clickhouse (https://packagecloud.io/docs#gpg_signing) <support@packagecloud.io>"
Fingerprint: 7001 38a9 6a20 6b22 bf28 3c06 ed26 58f3 0f6e 36f6
From : https://packagecloud.io/altinity/clickhouse/gpgkey
Is this ok [y/N]: y
altinity_clickhouse/x86_64/signature | 1.0 kB 00:00:23 !!!
altinity_clickhouse-source/signature | 836 B 00:00:00
Retrieving key from https://packagecloud.io/altinity/clickhouse/gpgkey
Importing GPG key 0x0F6E36F6:
Userid : "https://packagecloud.io/altinity/clickhouse (https://packagecloud.io/docs#gpg_signing) <support@packagecloud.io>"
Fingerprint: 7001 38a9 6a20 6b22 bf28 3c06 ed26 58f3 0f6e 36f6
From : https://packagecloud.io/altinity/clickhouse/gpgkey
Is this ok [y/N]: y    <-- enter "y"
altinity_clickhouse-source/signature | 1.0 kB 00:00:05 !!!
(1/2): altinity_clickhouse-source/primary | 995 B 00:00:04
(2/2): altinity_clickhouse/x86_64/primary | 46 kB 00:00:06
altinity_clickhouse 429/429
altinity_clickhouse-source 2/2
Available Packages
clickhouse-client.x86_64 19.15.3.6-1.el7 altinity_clickhouse
clickhouse-common-static.x86_64 19.15.3.6-1.el7 altinity_clickhouse
clickhouse-compressor.x86_64 1.1.54336-3.el7 altinity_clickhouse
clickhouse-debuginfo.x86_64 19.15.3.6-1.el7 altinity_clickhouse
clickhouse-odbc.x86_64 1.0.0.20190611-1 altinity_clickhouse
clickhouse-server.x86_64 19.15.3.6-1.el7 altinity_clickhouse
clickhouse-server-common.x86_64 19.15.3.6-1.el7 altinity_clickhouse
clickhouse-test.x86_64 19.15.3.6-1.el7 altinity_clickhouse
[root@master ~]#
The main part: installing the server and client packages
[root@master ~]# yum install -y clickhouse-server clickhouse-client
Loaded plugins: fastestmirror, langpacks
Loading mirror speeds from cached hostfile
* centos-sclo-rh: mirrors.njupt.edu.cn
Resolving Dependencies
--> Running transaction check
---> Package clickhouse-client.x86_64 0:19.15.3.6-1.el7 will be installed
---> Package clickhouse-server.x86_64 0:19.15.3.6-1.el7 will be installed
--> Processing Dependency: clickhouse-server-common = 19.15.3.6-1.el7 for package: clickhouse-server-19.15.3.6-1.el7.x86_64
--> Processing Dependency: clickhouse-common-static = 19.15.3.6-1.el7 for package: clickhouse-server-19.15.3.6-1.el7.x86_64
--> Running transaction check
---> Package clickhouse-common-static.x86_64 0:19.15.3.6-1.el7 will be installed
---> Package clickhouse-server-common.x86_64 0:19.15.3.6-1.el7 will be installed
--> Finished Dependency Resolution
Dependencies Resolved
===================================================================================================================================================================================================================
Package Arch Version Repository Size
===================================================================================================================================================================================================================
Installing:
clickhouse-client x86_64 19.15.3.6-1.el7 altinity_clickhouse 6.2 k
clickhouse-server x86_64 19.15.3.6-1.el7 altinity_clickhouse 10 M
Installing for dependencies:
clickhouse-common-static x86_64 19.15.3.6-1.el7 altinity_clickhouse 25 M
clickhouse-server-common x86_64 19.15.3.6-1.el7 altinity_clickhouse 10 k
Transaction Summary
===================================================================================================================================================================================================================
Install 2 Packages (+2 Dependent packages)
Total download size: 35 M
Installed size: 174 M
Downloading packages:
(1/4): clickhouse-client-19.15.3.6-1.el7.x86_64.rpm | 6.2 kB 00:00:02
(2/4): clickhouse-server-19.15.3.6-1.el7.x86_64.rpm | 10 MB 00:00:07
(3/4): clickhouse-server-common-19.15.3.6-1.el7.x86_64.rpm | 10 kB 00:00:00
(4/4): clickhouse-common-static-19.15.3.6-1.el7.x86_64.rpm | 25 MB 00:00:33
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Total 1.0 MB/s | 35 MB 00:00:33
Running transaction check
Running transaction test
Transaction test succeeded
Running transaction
Warning: RPMDB altered outside of yum.
** Found 13 pre-existing rpmdb problem(s), 'yum check' output follows:
akonadi-mysql-1.9.2-4.el7.x86_64 has missing requires of mariadb-server
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hadoop
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hadoop-hdfs
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hadoop-yarn
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hadoop-mapreduce
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hbase
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hive >= ('0', '0.12.0+cdh5.1.0', None)
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of zookeeper
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of hadoop-libhdfs
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of avro-libs
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of parquet
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of sentry >= ('0', '1.3.0+cdh5.1.0', None)
impala-kudu-2.7.0+cdh5.9.0+0-1.cdh5.9.0.p0.11.el7.x86_64 has missing requires of sentry
Installing : clickhouse-server-common-19.15.3.6-1.el7.x86_64 1/4
Installing : clickhouse-common-static-19.15.3.6-1.el7.x86_64 2/4
Installing : clickhouse-server-19.15.3.6-1.el7.x86_64 3/4
Create user clickhouse.clickhouse with datadir /var/lib/clickhouse
Installing : clickhouse-client-19.15.3.6-1.el7.x86_64 4/4
Create user clickhouse.clickhouse with datadir /var/lib/clickhouse
Verifying : clickhouse-common-static-19.15.3.6-1.el7.x86_64 1/4
Verifying : clickhouse-server-19.15.3.6-1.el7.x86_64 2/4
Verifying : clickhouse-server-common-19.15.3.6-1.el7.x86_64 3/4
Verifying : clickhouse-client-19.15.3.6-1.el7.x86_64 4/4
Installed:
clickhouse-client.x86_64 0:19.15.3.6-1.el7 clickhouse-server.x86_64 0:19.15.3.6-1.el7
Dependency Installed:
clickhouse-common-static.x86_64 0:19.15.3.6-1.el7 clickhouse-server-common.x86_64 0:19.15.3.6-1.el7
Complete!
[root@master ~]#
Check the successfully installed components
[root@master ~]# yum list installed 'clickhouse*'
Loaded plugins: fastestmirror, langpacks
Loading mirror speeds from cached hostfile
* centos-sclo-rh: mirrors.njupt.edu.cn
Installed Packages
clickhouse-client.x86_64 19.15.3.6-1.el7 @altinity_clickhouse
clickhouse-common-static.x86_64 19.15.3.6-1.el7 @altinity_clickhouse
clickhouse-server.x86_64 19.15.3.6-1.el7 @altinity_clickhouse
clickhouse-server-common.x86_64 19.15.3.6-1.el7 @altinity_clickhouse
[root@master ~]#
Start the server daemon
[root@master ~]# /etc/init.d/clickhouse-server restart
Start clickhouse-server service: Path to data directory in /etc/clickhouse-server/config.xml: /var/lib/clickhouse/
DONE
[root@master ~]#
Log in with the client and run a quick test
[root@master ~]# clickhouse-client
ClickHouse client version 19.15.3.6.
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 19.15.3 revision 54426.
Master :)
Master :) show databases;
SHOW DATABASES
┌─name────┐
│ default │
│ system │
└─────────┘
2 rows in set. Elapsed: 0.007 sec.
Master :) show tables;
SHOW TABLES
Ok.
0 rows in set. Elapsed: 0.002 sec.
Master :) select now()
SELECT now()
┌───────────────now()─┐
│ 2019-10-17 14:43:42 │
└─────────────────────┘
1 rows in set. Elapsed: 0.006 sec.
That completes the simplest possible database sanity check.
Master :) quit
Bye.
Starting and stopping the server
[root@master ~]# service clickhouse-server stop
Stop clickhouse-server service: DONE
[root@master ~]#
[root@master ~]#
[root@master ~]# service clickhouse-server start
Start clickhouse-server service: Path to data directory in /etc/clickhouse-server/config.xml: /var/lib/clickhouse/
DONE
[root@master ~]#
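Optionally, the service can be set to start at boot. A minimal sketch, assuming the SysV init script installed by the package above (whether a native systemd unit is shipped depends on the package; on CentOS 7 systemd wraps SysV scripts automatically):

# register the SysV init script for automatic start
chkconfig clickhouse-server on
# or, equivalently, through systemd (which forwards to chkconfig for SysV services)
systemctl enable clickhouse-server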
Basic parameter configuration:
Enable remote access
[root@master ~]# vi /etc/clickhouse-server/config.xml
Search for the listen_host keyword and open up the following entries:
<!-- <listen_host>::</listen_host> -->
<listen_host>::1</listen_host>
<listen_host>127.0.0.1</listen_host>
By default the server only listens on ::1 and 127.0.0.1, so remote logins are refused. Uncomment <listen_host>::</listen_host> (or add the server's own address) so remote clients can connect; a way to apply and verify the change is sketched below.
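A minimal sketch of applying and verifying the change (the remote address 192.168.1.10 is a placeholder for your server's address):

# after uncommenting <listen_host>::</listen_host> in config.xml, restart so the new listen address takes effect
service clickhouse-server restart
# confirm the server listens on all interfaces (9000 = native protocol, 8123 = HTTP)
ss -lntp | grep clickhouse
# from another machine, test connectivity
clickhouse-client --host 192.168.1.10 --port 9000 --query "SELECT 1"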
Memory limit setting
[root@master ~]# vi /etc/clickhouse-server/users.xml
<!-- Maximum memory usage for processing a single query, in bytes (10000000000 bytes, about 10 GB). -->
<max_memory_usage>10000000000</max_memory_usage>
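To confirm the limit actually in effect for the default profile, the built-in system.settings table can be queried; a small sketch (the value is per query, in bytes):

clickhouse-client --query "SELECT name, value FROM system.settings WHERE name = 'max_memory_usage'"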
Set the data directory
[root@master ~]# vi /etc/clickhouse-server/config.xml
<!-- Path to data directory, with trailing slash. -->
<path>/var/lib/clickhouse/</path>
<!-- Path to temporary data for processing hard queries. -->
<tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
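If /var/lib/clickhouse sits on a small system disk, the data directory can be relocated before loading data. A rough sketch, assuming a hypothetical mount point /data01 (keep the trailing slash when editing <path> and <tmp_path>):

service clickhouse-server stop
mkdir -p /data01/clickhouse
# preserve any existing data and hand ownership to the clickhouse user
rsync -a /var/lib/clickhouse/ /data01/clickhouse/
chown -R clickhouse:clickhouse /data01/clickhouse
# point <path> at /data01/clickhouse/ and <tmp_path> at /data01/clickhouse/tmp/ in config.xml, then:
service clickhouse-server start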
Download the test data (the On_Time flight performance dataset):
[root@master ~]# for s in `seq 1987 2017`
do
for m in `seq 1 12`
do
wget http://transtats.bts.gov/PREZIP/On_Time_On_Time_Performance_${s}_${m}.zip
done
done
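A quick sanity check once the downloads finish (31 years times 12 months, so a few hundred zip files are expected; the exact count depends on which months the site provides):

ls On_Time_On_Time_Performance_*.zip | wc -l
# total size on disk
du -ch On_Time_On_Time_Performance_*.zip | tail -1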
Log in to the database with the client
[hadoop@master clickhouse]$ id
uid=1001(hadoop) gid=1001(hadoop) groups=1001(hadoop) context=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
[hadoop@master clickhouse]$ clickhouse-client
ClickHouse client version 19.15.3.6.
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 19.15.3 revision 54426.
Master :)
Master :) create database test;
CREATE DATABASE test
Ok.
0 rows in set. Elapsed: 0.004 sec.
Master :) use test;
USE test
Ok.
0 rows in set. Elapsed: 0.001 sec.
Master :)
Master :) create table t (id UInt16, name String) ENGINE = Memory;
CREATE TABLE t
(
`id` UInt16,
`name` String
)
ENGINE = Memory
Ok.
0 rows in set. Elapsed: 0.004 sec.
Master :) insert into t(id, name) values (1, 'abc'), (2, 'xxx');
INSERT INTO t (id, name) VALUES
Ok.
2 rows in set. Elapsed: 0.005 sec.
Master :) select count(*) from t;
SELECT count(*)
FROM t
┌─count()─┐
│ 2 │
└─────────┘
1 rows in set. Elapsed: 0.004 sec.
Master :)
Master :) CREATE TABLE test02( id UInt16,col1 String,col2 String,create_date date ) ENGINE = MergeTree(create_date, (id), 8192);
CREATE TABLE test02
(
`id` UInt16,
`col1` String,
`col2` String,
`create_date` date
)
ENGINE = MergeTree(create_date, id, 8192)
Ok.
0 rows in set. Elapsed: 0.010 sec.
Master :)
Master :)
Master :) CREATE TABLE distributed_table AS test02 ENGINE = Distributed(cluster, db, test02, rand());
CREATE TABLE distributed_table AS test02
ENGINE = Distributed(cluster, db, test02, rand())
Received exception from server (version 19.15.3):
Code: 170. DB::Exception: Received from localhost:9000. DB::Exception: Requested cluster 'cluster' not found.
0 rows in set. Elapsed: 0.258 sec.
Master :)
Creating a local table
CREATE TABLE test02( id UInt16,col1 String,col2 String,create_date date ) ENGINE = MergeTree(create_date, (id), 8192);
ENGINE: the table engine type.
MergeTree: the most commonly used engine; in this legacy syntax it requires a date column and a primary key.
The Log engine has no such requirement and is also fairly common.
ReplicatedMergeTree: a replicated variant of MergeTree.
Distributed: the distributed engine.
create_date: the table's date column; with this syntax every table must have one.
id: the table's primary key; the key may consist of several columns, separated by commas.
8192: the index granularity; the default of 8192 is fine. (The equivalent modern syntax is sketched right after this list.)
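For reference, the same table can also be declared with the newer MergeTree syntax, which names the three legacy parameters explicitly (partition key, sorting key, index granularity). A sketch using a hypothetical table name test02_new in the test database:

clickhouse-client --database test --query "
CREATE TABLE test02_new
(
    id UInt16,
    col1 String,
    col2 String,
    create_date Date
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(create_date)
ORDER BY id
SETTINGS index_granularity = 8192"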
Creating a distributed table
CREATE TABLE distributed_table AS test02 ENGINE = Distributed(cluster, db, test02, rand());
cluster: the cluster name defined in the configuration file.
db: the database name.
test02: the local table name.
rand(): sharding key, random distribution.
intHash64(): sharding key, hash of a specified column.
The Distributed engine picks a "healthy" replica within each target shard to execute the SQL. (A sketch of the cluster definition that the earlier error was missing follows this list.)
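The "Requested cluster 'cluster' not found" error above appears because no cluster with that name is defined on this server. A rough sketch of the missing definition, written as a config.d drop-in with three placeholder hosts node1/node2/node3 (on this version the configuration root element is <yandex>):

mkdir -p /etc/clickhouse-server/config.d
cat > /etc/clickhouse-server/config.d/clusters.xml <<'EOF'
<yandex>
  <remote_servers>
    <!-- a cluster literally named "cluster", with three shards and one replica each -->
    <cluster>
      <shard>
        <replica><host>node1</host><port>9000</port></replica>
      </shard>
      <shard>
        <replica><host>node2</host><port>9000</port></replica>
      </shard>
      <shard>
        <replica><host>node3</host><port>9000</port></replica>
      </shard>
    </cluster>
  </remote_servers>
</yandex>
EOF
service clickhouse-server restart
# verify the cluster is now visible
clickhouse-client --query "SELECT cluster, shard_num, host_name FROM system.clusters"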
Basic performance benchmarks
CREATE TABLE ontime
( Year UInt16,
Quarter UInt8,
Month UInt8,
DayofMonth UInt8,
DayOfWeek UInt8,
FlightDate Date,
UniqueCarrier FixedString(7),
AirlineID Int32,
Carrier FixedString(2),
TailNum String,
FlightNum String,
OriginAirportID Int32,
OriginAirportSeqID Int32,
OriginCityMarketID Int32,
Origin FixedString(5),
OriginCityName String,
OriginState FixedString(2),
OriginStateFips String,
OriginStateName String,
OriginWac Int32,
DestAirportID Int32,
DestAirportSeqID Int32,
DestCityMarketID Int32,
Dest FixedString(5),
DestCityName String,
DestState FixedString(2),
DestStateFips String,
DestStateName String,
DestWac Int32,
CRSDepTime Int32,
DepTime Int32,
DepDelay Int32,
DepDelayMinutes Int32,
DepDel15 Int32,
DepartureDelayGroups String,
DepTimeBlk String,
TaxiOut Int32,
WheelsOff Int32,
WheelsOn Int32,
TaxiIn Int32,
CRSArrTime Int32,
ArrTime Int32,
ArrDelay Int32,
ArrDelayMinutes Int32,
ArrDel15 Int32,
ArrivalDelayGroups Int32,
ArrTimeBlk String,
Cancelled UInt8,
CancellationCode FixedString(1),
Diverted UInt8,
CRSElapsedTime Int32,
ActualElapsedTime Int32,
AirTime Int32,
Flights Int32,
Distance Int32,
DistanceGroup UInt8,
CarrierDelay Int32,
WeatherDelay Int32,
NASDelay Int32,
SecurityDelay Int32,
LateAircraftDelay Int32,
FirstDepTime String,
TotalAddGTime String,
LongestAddGTime String,
DivAirportLandings String,
DivReachedDest String,
DivActualElapsedTime String,
DivArrDelay String,
DivDistance String,
Div1Airport String,
Div1AirportID Int32,
Div1AirportSeqID Int32,
Div1WheelsOn String,
Div1TotalGTime String,
Div1LongestGTime String,
Div1WheelsOff String,
Div1TailNum String,
Div2Airport String,
Div2AirportID Int32,
Div2AirportSeqID Int32,
Div2WheelsOn String,
Div2TotalGTime String,
Div2LongestGTime String,
Div2WheelsOff String,
Div2TailNum String,
Div3Airport String,
Div3AirportID Int32,
Div3AirportSeqID Int32,
Div3WheelsOn String,
Div3TotalGTime String,
Div3LongestGTime String,
Div3WheelsOff String,
Div3TailNum String,
Div4Airport String,
Div4AirportID Int32,
Div4AirportSeqID Int32,
Div4WheelsOn String,
Div4TotalGTime String,
Div4LongestGTime String,
Div4WheelsOff String,
Div4TailNum String,
Div5Airport String,
Div5AirportID Int32,
Div5AirportSeqID Int32,
Div5WheelsOn String,
Div5TotalGTime String,
Div5LongestGTime String,
Div5WheelsOff String,
Div5TailNum String
) ENGINE = MergeTree(FlightDate, (Year, FlightDate), 8192);
CREATE TABLE ontimetest AS ontime ENGINE = Distributed(perftest_3shards_1replicas, test, ontime, rand());
Import the downloaded data
for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --query="INSERT INTO ontime FORMAT CSVWithNames"; done
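A quick check that the load went through (the ontime table above was created in the default database):

clickhouse-client --query "SELECT count(), min(FlightDate), max(FlightDate) FROM ontime"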
Test environment
Hardware: 3 machines, each with 32 GB of RAM and a 200 GB disk
Data volume: 10 GB, about 340 million rows
count performance test
select count(1) from
(select ArrTime,CRSArrTime,FlightDate from ontimetest limit 200000000) t1 ;
Rows           Elapsed      Memory
10 million     0.543 s      15.81 MB
100 million    0.863 s      187.58 MB
200 million    0.913 s      261.31 MB
300 million    1.181 s      485.01 MB
group by performance test
SELECT
OriginCityName,
DestCityName,
count(*) AS flights
FROM (select OriginCityName,DestCityName from ontimetest limit 100000000) t1 GROUP BY OriginCityName, DestCityName ORDER BY flights DESC LIMIT 20 ;
Rows           Elapsed      Memory
1 million      0.258 s      42.72 MB
10 million     1.336 s      438.28 MB
100 million    11.186 s     4.47 GB
200 million    21.983 s     8.89 GB
300 million    30.538 s     13.32 GB
join performance test
select * from
(select ArrTime,CRSArrTime,FlightDate from ontimetest limit 100000000) t1
ALL INNER JOIN (select ArrTime,CRSArrTime,FlightDate from ontimetest limit 10000000) t2 on t1.ArrTime=t2.CRSArrTime limit 100 ;
Left rows      Right rows      Elapsed       Memory
10 million     100 thousand    0.606 s       8.29 MB
100 million    100 thousand    0.684 s       8.55 MB
200 million    100 thousand    0.962 s       7.99 MB
300 million    100 thousand    1.088 s       8.08 MB
10 million     1 million       11.756 s      13.03 MB
100 million    1 million       11.795 s      13.27 MB
200 million    1 million       11.972 s      13.50 MB
300 million    1 million       12.825 s      14.47 MB
10 million     10 million      150.931 s     42.33 MB
100 million    10 million      164.897 s     43.50 MB
200 million    10 million      168.973 s     46.99 MB
300 million    10 million      169.694 s     49.63 MB
Performance test summary:
count: very fast, but memory consumption is fairly high.
group by: very fast, but memory consumption is very high.
join: fast compared with Spark (though, for a real-time query system, still on the slow side); memory consumption is low, but CPU consumption is high.
Other observations:
Concurrency is limited: the official documentation recommends at most 100 queries per second, so ClickHouse is not suited to high-concurrency operational queries.
Columnar storage: values within a column are similar, so they compress well and reduce I/O (by contrast, Hadoop's row-oriented storage makes loading data much more time-consuming). Queries touching a few columns are very fast, but queries that select many columns slow down noticeably.
Positioning:
For real-time workloads, ClickHouse handles far more data than MySQL, but count and group by consume a lot of memory, which application servers cannot easily absorb under high concurrency; group by and join queries still cannot deliver second-level response times; and concurrency tops out around 100 queries per second, so it is not suitable for high-concurrency, real-time operational queries.
For batch processing, its columnar storage cuts I/O and gives computational performance well beyond Spark and Hive, and 100 queries per second is far more than the few dozen concurrent tasks a Spark cluster typically runs, so using ClickHouse in place of a Hadoop cluster as an offline batch-processing framework is an entirely viable option.