MySQL 5.7的一个新特性,generated column
http://dev.mysql.com/doc/refman/5.7/en/create-table.html#create-table-generated-columns
即generated column的值是普通column产生值,有点像视图,但是又有别于视图,因为它可以选择是否存储generated column产生的值。
CREATE TABLE triangle (
sidea DOUBLE,
sideb DOUBLE,
sidec DOUBLE AS (SQRT(sidea * sidea + sideb * sideb))
);
INSERT INTO triangle (sidea, sideb) VALUES(1,1),(3,4),(6,8);
mysql> SELECT * FROM triangle;
+-------+-------+--------------------+
| sidea | sideb | sidec |
+-------+-------+--------------------+
| 1 | 1 | 1.4142135623730951 |
| 3 | 4 | 5 |
| 6 | 8 | 10 |
+-------+-------+--------------------+
col_name data_type [GENERATED ALWAYS] AS (expression)
[VIRTUAL | STORED] [UNIQUE [KEY]] [COMMENT comment]
[[NOT] NULL] [[PRIMARY] KEY]
virtual不存储值,stored存储值(并支持索引)。
但是MySQL这个特性貌似用处并不大,例如要参与计算的行只能是当前行。
在物联网中,可能存在类似的需求,但是一般会要求参与计算的行是相邻的N行,或者有规则可寻的N行。例如按照相邻的5行计算平均值,最大值,最小值,方差。
MySQL 满足不了这样的需求。
在PostgreSQL中,这不是什么新鲜概念,而且支持得更彻底。
例子:
对应 mysql vitrual generated column
postgres=# create table test(c1 int, c2 int);
CREATE TABLE
postgres=# create view v_test as select c1,c2,sqrt(c1*c2+c1*c2) from test;
CREATE VIEW
postgres=# insert into test values (1,2),(10,20);
INSERT 0 2
postgres=# select * from v_test;
c1 | c2 | sqrt
----+----+------
1 | 2 | 2
10 | 20 | 20
(2 rows)
对应 mysql stored generated column
postgres=# create materialized view v_test1 as select c1,c2,sqrt(c1*c2+c1*c2) from test;
SELECT 2
postgres=# select * from v_test1;
c1 | c2 | sqrt
----+----+------
1 | 2 | 2
10 | 20 | 20
(2 rows)
还有一个更适合物联网场景的,流式处理 :
pipeline=# create stream s1(c1 int, c2 int);
CREATE STREAM
pipeline=# create continuous view test as select c1,c2,sqrt(c1*c1+c2*c2) from s1;
CREATE CONTINUOUS VIEW
pipeline=# activate;
ACTIVATE
pipeline=# insert into s1 values (1,2),(10,20);
INSERT 0 2
pipeline=# select * from test;
c1 | c2 | sqrt
----+----+------------------
1 | 2 | 2.23606797749979
10 | 20 | 22.3606797749979
(2 rows)
流式处理加窗口和实时聚合 :
pipeline=# create continuous view test1 as select c1,count(*) over(partition by c1) from s1 ;
CREATE CONTINUOUS VIEW
pipeline=# create continuous view test2 as select c2,count(*) over w from s1 window w as(partition by c2);
CREATE CONTINUOUS VIEW
pipeline=# insert into s1 values (1,2);
INSERT 0 1
pipeline=# select * from test1;
c1 | count
----+-------
1 | 1
(1 row)
pipeline=# select * from test2;
c2 | count
----+-------
2 | 1
(1 row)
实时分析每个URL的访问次数,用户数,99%用户的访问延迟低于多少。
/*
* This function will strip away any query parameters from each url,
* as we're not interested in them.
*/
CREATE FUNCTION url(raw text, regex text DEFAULT '\?.*', replace text DEFAULT '')
RETURNS text
AS 'textregexreplace_noopt' -- textregexreplace_noopt@src/backend/utils/adt/regexp.c
LANGUAGE internal;
CREATE CONTINUOUS VIEW url_stats AS
SELECT
url, -- url地址
percentile_cont(0.99) WITHIN GROUP (ORDER BY latency_ms) AS p99, -- 99%的URL访问延迟小于多少
count(DISTINCT user) AS uniques, -- 唯一用户数
count(*) total_visits -- 总共访问次数
FROM
(SELECT
url(payload->>'url'), -- 地址
payload->>'user' AS user, -- 用户ID
(payload->>'latency')::float * 1000 AS latency_ms, -- 访问延迟
arrival_timestamp
FROM logs_stream) AS unpacked
WHERE arrival_timestamp > clock_timestamp() - interval '1 day'
GROUP BY url;
CREATE CONTINUOUS VIEW user_stats AS
SELECT
day(arrival_timestamp),
payload->>'user' AS user,
sum(CASE WHEN payload->>'url' LIKE '%landing_page%' THEN 1 ELSE 0 END) AS landings,
sum(CASE WHEN payload->>'url' LIKE '%conversion%' THEN 1 ELSE 0 END) AS conversions,
count(DISTINCT url(payload->>'url')) AS unique_urls,
count(*) AS total_visits
FROM logs_stream GROUP BY payload->>'user', day;
-- What are the top-10 most visited urls?
SELECT url, total_visits FROM url_stats ORDER BY total_visits DESC limit 10;
url | total_visits
---------------+--------------
/page62/path4 | 10182
/page51/path4 | 10181
/page24/path5 | 10180
/page93/path3 | 10180
/page81/path0 | 10180
/page2/path5 | 10180
/page75/path2 | 10179
/page28/path3 | 10179
/page40/path2 | 10178
/page74/path0 | 10176
(10 rows)
-- What is the 99th percentile latency across all urls?
SELECT combine(p99) FROM url_stats;
combine
------------------
6.95410494731137
(1 row)
-- What is the average conversion rate each day for the last month?
SELECT day, avg(conversions / landings) FROM user_stats GROUP BY day;
day | avg
------------------------+----------------------------
2015-09-15 00:00:00-07 | 1.7455000000000000000000000
(1 row)
-- How many unique urls were visited each day for the last week?
SELECT day, combine(unique_urls) FROM user_stats WHERE day > now() - interval '1 week' GROUP BY day;
day | combine
------------------------+---------
2015-09-15 00:00:00-07 | 100000
(1 row)
-- Is there a relationship between the number of unique urls visited and the highest conversion rates?
SELECT unique_urls, sum(conversions) / sum(landings) AS conversion_rate FROM user_stats
GROUP BY unique_urls ORDER BY conversion_rate DESC LIMIT 10;
unique_urls | conversion_rate
-------------+-------------------
41 | 2.67121005785842
36 | 2.02713894173361
34 | 2.02034637010851
31 | 2.01958418072859
27 | 2.00045348712296
24 | 1.99714899522942
19 | 1.99438839453606
16 | 1.98083502184886
15 | 1.87983011139079
14 | 1.84906254929873
(1 row)