物联网兴起,GPS终端也越来越普及,比如车载的终端,带GPS功能的手表,手机等等。
比如滴滴打车,出租车都记录了车辆的行驶的位点。
位点通常会带有终端ID,经纬度,时间等信息。
但是我们如何将这些点的信息合并成车辆的运行轨迹呢?并不是粗暴的聚合这么简单。
因为车辆在使用过程中会遇到等红灯,停车等乘客,穿越没有GPS信号的地方(如隧道),或者终端异常,未打开GPS记录等情况。
直接将行驶记录的位置信息串起来之后并不是真正的运行轨迹。
那么怎么合并或切分成有价值的连续行驶的数据呢?
一种较为通用的手段,通过距离,时间两个维度的差值阈值,可以将一个大的轨迹切分成多个连续的轨迹片段。
为了演示方便,我这里以二维的点状数据为例,给大家展示一下切分和合并的方法。
实际生产通常使用的是PostgreSQL的地理位置插件PostGIS(被称之为"宇宙最强"的开源GIS管理系统)。
测试数据
postgres=# create table test (id int, pos point, ts timestamp);
CREATE TABLE
id 终端ID
pos 位置
ts 时间
生成100万测试记录
postgres=# insert into test select trunc(1000*random()), ( '('||trunc(1000*random())||','||trunc(1000*random())||')' )::point, now() + (''||trunc(10000*random())||' second') ::interval from generate_series(1,1000000);
INSERT 0 1000000
postgres=# select * from test where id=1 order by ts limit 10;
id | pos | ts
----+-----------+----------------------------
1 | (242,463) | 2016-06-10 23:50:18.433406
1 | (869,986) | 2016-06-10 23:50:18.433406
1 | (542,704) | 2016-06-10 23:50:19.433406
1 | (746,42) | 2016-06-10 23:50:32.433406
1 | (843,161) | 2016-06-10 23:50:45.433406
1 | (40,288) | 2016-06-10 23:50:55.433406
1 | (617,176) | 2016-06-10 23:51:05.433406
1 | (615,198) | 2016-06-10 23:51:07.433406
1 | (491,361) | 2016-06-10 23:51:14.433406
1 | (566,956) | 2016-06-10 23:51:20.433406
(10 rows)
如果进来的是数组,首先使用unnest解析成行, 例子
postgres=# select unnest(array[(1,'(1,1)','2016-01-01')::test, (1,'(1,100)','2016-01-02')::test]);
unnest
-------------------------------------
(1,"(1,1)","2016-01-01 00:00:00")
(1,"(1,100)","2016-01-02 00:00:00")
(2 rows)
进来的如果不是数组,已经是行,可以省略前面一步。
切分DEMO
设置两个维度的阈值
.1. 相邻两点的距离
距离很大,说明可能是中间异常了。
.2. 相邻两点的时间差
距离很短,但是时间很长,说明可能是车子静止了,例如停车等乘客,或者吃饭什么的。
为了方便解释,分解成几步
创建视图
create view v1 as
select
id,
pos_e,
ts_e,
rn,
case when rn>1 then point_distance(pos_s, pos_e) else 0 end pos_distance,
case when rn>1 then ts_e-ts_s else interval '0 sec' end ts_distance
from
( select
id,
row_number() over (partition by id order by ts) rn,
lag(pos) over (partition by id order by ts) pos_s,
pos pos_e,
lag(ts) over (partition by id order by ts) ts_s,
ts ts_e
from test ) t;
postgres=# select * from v1 limit 10;
id | pos_e | ts_e | rn | pos_distance | ts_distance
----+-----------+----------------------------+----+------------------+-------------
0 | (571,395) | 2016-06-10 23:50:21.433406 | 1 | 0 | 00:00:00
0 | (548,943) | 2016-06-10 23:50:37.433406 | 2 | 548.482451861498 | 00:00:16
0 | (638,497) | 2016-06-10 23:50:57.433406 | 3 | 454.990109782619 | 00:00:20
0 | (5,214) | 2016-06-10 23:50:57.433406 | 4 | 693.381568834938 | 00:00:00
0 | (355,720) | 2016-06-10 23:51:00.433406 | 5 | 615.252793573503 | 00:00:03
0 | (472,670) | 2016-06-10 23:51:10.433406 | 6 | 127.23600119463 | 00:00:10
0 | (930,952) | 2016-06-10 23:51:29.433406 | 7 | 537.854999047141 | 00:00:19
0 | (893,145) | 2016-06-10 23:51:31.433406 | 8 | 807.847757934625 | 00:00:02
0 | (700,91) | 2016-06-10 23:51:45.433406 | 9 | 200.412075484488 | 00:00:14
0 | (465,697) | 2016-06-10 23:51:52.433406 | 10 | 649.96999930766 | 00:00:07
(10 rows)
视图字段含义
postgres=# \d+ v1
View "public.v1"
Column | Type | Modifiers | Storage | Description
--------------+-----------------------------+-----------+---------+-------------
id | integer | | plain | 终端ID
pos_e | point | | plain | 位置
ts_e | timestamp without time zone | | plain | 时间点
rn | bigint | | plain | 终端轨迹按时间顺序的序号
pos_distance | double precision | | plain | 与前一个点的距离
ts_distance | interval | | plain | 与前一个点的时间差
得到切分点的SQL
postgres=#
select *,
case when rn=1 or (pos_distance>100 and ts_distance>interval '20 sec') then 'split' else '' end
from v1 limit 100;
id | pos_e | ts_e | rn | pos_distance | ts_distance | case
----+-----------+----------------------------+-----+------------------+-------------+-------
0 | (571,395) | 2016-06-10 23:50:21.433406 | 1 | 0 | 00:00:00 | split
0 | (548,943) | 2016-06-10 23:50:37.433406 | 2 | 548.482451861498 | 00:00:16 |
0 | (638,497) | 2016-06-10 23:50:57.433406 | 3 | 454.990109782619 | 00:00:20 |
0 | (5,214) | 2016-06-10 23:50:57.433406 | 4 | 693.381568834938 | 00:00:00 |
0 | (355,720) | 2016-06-10 23:51:00.433406 | 5 | 615.252793573503 | 00:00:03 |
0 | (472,670) | 2016-06-10 23:51:10.433406 | 6 | 127.23600119463 | 00:00:10 |
0 | (930,952) | 2016-06-10 23:51:29.433406 | 7 | 537.854999047141 | 00:00:19 |
0 | (893,145) | 2016-06-10 23:51:31.433406 | 8 | 807.847757934625 | 00:00:02 |
0 | (700,91) | 2016-06-10 23:51:45.433406 | 9 | 200.412075484488 | 00:00:14 |
0 | (465,697) | 2016-06-10 23:51:52.433406 | 10 | 649.96999930766 | 00:00:07 |
0 | (890,202) | 2016-06-10 23:51:58.433406 | 11 | 652.418577295282 | 00:00:06 |
0 | (271,425) | 2016-06-10 23:52:22.433406 | 12 | 657.943766594076 | 00:00:24 | split
0 | (771,898) | 2016-06-10 23:52:27.433406 | 13 | 688.279739640795 | 00:00:05 |
0 | (392,348) | 2016-06-10 23:52:28.433406 | 14 | 667.9378713623 | 00:00:01 |
0 | (705,388) | 2016-06-10 23:52:30.433406 | 15 | 315.545559309587 | 00:00:02 |
0 | (393,176) | 2016-06-10 23:52:39.433406 | 16 | 377.210816387866 | 00:00:09 |
0 | (134,690) | 2016-06-10 23:52:39.433406 | 17 | 575.566677284222 | 00:00:00 |
0 | (232,108) | 2016-06-10 23:52:42.433406 | 18 | 590.193188710273 | 00:00:03 |
0 | (106,401) | 2016-06-10 23:53:05.433406 | 19 | 318.943568676341 | 00:00:23 | split
0 | (217,665) | 2016-06-10 23:53:19.433406 | 20 | 286.386103014794 | 00:00:14 |
0 | (348,467) | 2016-06-10 23:53:41.433406 | 21 | 237.413142011979 | 00:00:22 | split
0 | (424,654) | 2016-06-10 23:53:50.433406 | 22 | 201.853907566834 | 00:00:09 |
0 | (389,515) | 2016-06-10 23:54:06.433406 | 23 | 143.33875958721 | 00:00:16 |
0 | (238,834) | 2016-06-10 23:54:06.433406 | 24 | 352.933421483429 | 00:00:00 |
0 | (219,85) | 2016-06-10 23:54:17.433406 | 25 | 749.24094922795 | 00:00:11 |
0 | (4,561) | 2016-06-10 23:54:25.433406 | 26 | 522.303551586623 | 00:00:08 |
0 | (815,512) | 2016-06-10 23:54:39.433406 | 27 | 812.47892280354 | 00:00:14 |
0 | (756,592) | 2016-06-10 23:54:41.433406 | 28 | 99.4032192637643 | 00:00:02 |
0 | (820,732) | 2016-06-10 23:54:58.433406 | 29 | 153.935051239151 | 00:00:17 |
0 | (439,749) | 2016-06-10 23:55:04.433406 | 30 | 381.379076510498 | 00:00:06 |
0 | (260,64) | 2016-06-10 23:55:07.433406 | 31 | 708.00141242797 | 00:00:03 |
0 | (386,535) | 2016-06-10 23:55:18.433406 | 32 | 487.562303711023 | 00:00:11 |
0 | (414,226) | 2016-06-10 23:55:32.433406 | 33 | 310.266014896895 | 00:00:14 |
0 | (425,533) | 2016-06-10 23:55:56.433406 | 34 | 307.197005193736 | 00:00:24 | split
0 | (970,845) | 2016-06-10 23:55:58.433406 | 35 | 627.988057211282 | 00:00:02 |
写成函数,处理轨迹的合并和切分。
每条轨迹以数组的形式返回。
(如果你要返回带有经纬度和时间属性的GIS类型轨迹,改一下这个函数内容即可)。
create or replace function path_agg(
i_pos_distance int, -- 距离
i_ts_distance interval -- 时间差
) returns setof test[] as
$$
declare
tmp_id int;
i_id int;
i_pos point;
i_ts timestamp;
i_split boolean;
res test[];
begin
for i_id,i_pos,i_ts,i_split in select id, pos_e, ts_e, case when (pos_distance > i_pos_distance and ts_distance > i_ts_distance) or rn=1 then true else false end from v1 LOOP
if tmp_id is null or res is null then
-- 第一行, 生成初始 res
res := array[(i_id, i_pos, i_ts)::test];
elsif tmp_id <> i_id then
-- 换ID, 输出next, 生成初始res
return next res;
res := array[(i_id, i_pos, i_ts)::test];
elsif i_split then
-- 切分, 输出next, 生成初始res
return next res;
res := array[(i_id, i_pos, i_ts)::test];
else
-- 合并
res := array_append(res, (i_id, i_pos, i_ts)::test);
end if;
tmp_id := i_id;
end loop;
-- 输出最后一个res
return next res;
-- 返回
return;
end;
$$
language plpgsql strict;
验证
postgres=# select * from path_agg(100, interval '20 second');
{"(0,\"(571,395)\",\"2016-06-10 23:50:21.433406\")","(0,\"(548,943)\",\"2016-06-10 23:50:37.433406\")","(0,\"(638,497)\",\"2016-06-10 23:50:57.433406\")","(0,\"(5,214)\",\"2016-06-10 23:50:57.433406\")","(0,\"(355,720)\",\"2016-06-10 23
:51:00.433406\")","(0,\"(472,670)\",\"2016-06-10 23:51:10.433406\")","(0,\"(930,952)\",\"2016-06-10 23:51:29.433406\")","(0,\"(893,145)\",\"2016-06-10 23:51:31.433406\")","(0,\"(700,91)\",\"2016-06-10 23:51:45.433406\")","(0,\"(465,697)\
",\"2016-06-10 23:51:52.433406\")","(0,\"(890,202)\",\"2016-06-10 23:51:58.433406\")"}
{"(0,\"(271,425)\",\"2016-06-10 23:52:22.433406\")","(0,\"(771,898)\",\"2016-06-10 23:52:27.433406\")","(0,\"(392,348)\",\"2016-06-10 23:52:28.433406\")","(0,\"(705,388)\",\"2016-06-10 23:52:30.433406\")","(0,\"(393,176)\",\"2016-06-10
23:52:39.433406\")","(0,\"(134,690)\",\"2016-06-10 23:52:39.433406\")","(0,\"(232,108)\",\"2016-06-10 23:52:42.433406\")"}
{"(0,\"(106,401)\",\"2016-06-10 23:53:05.433406\")","(0,\"(217,665)\",\"2016-06-10 23:53:19.433406\")"}
......
小结
本文主要用到了PostgreSQL的几个功能特性。
地理位置信息管理,窗口函数,plpgsql函数语言,复合类型,数组类型,返回多行的函数。