1.计算昨天的日期
ds = TO_CHAR(DATEADD(TO_DATE('${bizdate}','yyyymmdd'), - 1,'dd'),'yyyymmdd')
2.计算数据的涨跌趋势
CASE WHEN col_1 < col_2 AND ABS(col_1 - col_2) >= 0.01 THEN 'down' WHEN col_1 > col_2 AND ABS(col_1 - col_2) >= 0.01 THEN 'up' WHEN ABS(col_1 - col_2) < 0.01 THEN 'eq' END AS updown
3.通过日期计算年月/年同理操作
substr(ds, 1, 6) = '${bizmonth}'
4.字符串拼接ID/切割字符串和ID
concat(1, '^', col_1) as col_1,SPLIT_PART(col_1, '^', 1) as col_2,
5.数据多列转行
trans_cols( 2, col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8, col_9, col_10 ) AS (id, col_1, col_2, col_11)
6.为空值添加默认值
COALESCE(col_1, FALSE)COALESCE(col_2, 0)
7.条件判断/模式匹配 CASE WHEN
CASE WHEN sum(col_1) < sum(col_2) AND ABS(sum(col_1) - sum(col_2)) >= 0.01 THEN 'down' WHEN sum(col_1) > sum(col_2) AND ABS(sum(col_1) - sum(col_2)) >= 0.01 THEN 'up' WHEN ABS(sum(col_1) - sum(col_2)) < 0.01 THEN 'eq' END AS updown
8.条件判断 IF ELSE
IF(col_1 = 0, -1, col_1)
9.查询表最大分区
ds = MAX_PT('db.table')
10.返回某字段并指定默认值
SELECT 0 AS col_1,'test' AS col_2FROM db.table
11.字符串格式化为日期
TO_DATE('${bizdate}', 'yyyymmdd')
12.日期转化为字符串
TO_CHAR(DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), -1, 'mm'), 'yyyymmdd')
13.计算之前/之后N天/月等日期
DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), -1, 'mm')DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), 1, 'mm')DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), 1, 'dd')DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), -1, 'dd')
14.字符串补位
LPAD(col1_1, 2, '0')
15.过滤等于0或空的数据 HAVING, 这里可以使用count等来做数量的控制
HAVING COALESCE(col_1, 0)<>0
16.计算一年的时间范围
DATE_FORMAT(DATE_SUB(DATE_FORMAT(#{end_date}, '%Y%m%d'), 366), '%Y%m%d')
17.SQL返回结果添加自增ID
SELECT ROW_NUMBER() OVER () AS id,
col_1,
col_2
FROM db.tb
注:当row_number函数与order by同级别使用时,row_number产生的序号会同未排序前的顺序结果保持一致,也就是乱序的。因此建议将
row_number放在order by外层使用;
18.SQL 中字段类型转换(这里以String转换为BigInt为例)
CAST(stat_date AS BIGINT) AS stat_date
CONVERT(stat_date, BIGINT) AS stat_date
19.SQL中统计数值范围内个数
COUNT(CASE WHEN col_1 BETWEEN 0 AND 10 THEN 1 END) AS col_2
20.笛卡尔积,两个字段汇总到一张表(有无关联字段无所谓)
并且要标记出哪个表数据量少,放在MAPJOIN的注释中;
SELECT
/*+ MAPJOIN (t_2) */t_1.col_1 AS col_1,
t_2.col_2 AS col_2
FROM
t_xxxx AS t_1,
t_yyyy AS t_2
21.Join长尾优化
长尾SQL在JOIN阶段会将Join Key相同的数据分发到一个Instance上处理。如果某个Key上的数据量较多,会导致该Instance执行时间比其它Instance执行时间长,其表现为:追星日志中该Join Task的大部分Instance多已经执行完成,但少数几个Instance一直处于执行中;
常见解决方案
MapJoin
可以采用MapJoin避免数据倾斜,在SELECT后面加上如下语句:
/*+ mapjoin(t_1) */
其中t_1代表其中的相对来说较小的表或者子查询;
处理空值;
统一去掉空值,或者转化成为随机数(数量级别上可以很大或者很小,不要影响当前字段的其它数据);
热点Key问题
找到热点Key,切分成多个小表/子查询等,再来处理。
22.修改分区生命周期
alter table yyyy.xxxx set LIFECYCLE 731
23.统计重复条数
SELECT
id,
COUNT(*)
FROM
db.t_1
WHERE
ds=MAX_PT('db.t_1')
GROUP BY
id
HAVING COUNT(*) > 1
)
24.增加字段
alter table db.tb ADD COLUMNS(col1 BIGINT COMMENT '1是,0否')
25.只匹配包含中文的字符串
COLUMN RLIKE '[^\x00-\xff]+'
26.工号前面补0(避免匹配中有些带有前置零的情况)
IF (LENGTH(work_id) < 6 AND work_id NOT RLIKE '^[[:alpha:]]', CONCAT(REPEAT('0', 6 - LENGTH(work_id)), work_id), work_id)
27.打平连接(可选择分隔符)
wm_concat('、', column)
28.列转行(多列转为三、二列,形成kv结构的表)
WITH t_bid_1 AS (
SELECT
col_1,
col_2,
...
col_n
FROM
db.table
), t_bid_2 AS (
SELECT
id,
TRIM(SPLIT_PART(field, '^', 1)) AS KEY,
TRIM(SPLIT_PART(field, '^', 2)) AS VALUE
FROM (
SELECT
id,
CONCAT("col_1", "^", col_1) AS col_1,
CONCAT("col_2", "^", col_2) AS col_2,
...
CONCAT("col_n", "^", col_n) AS col_n
FROM
t_bid_1
)
LATERAL VIEW EXPLODE(ARRAY(id,
col_1,
col_2,
...
col_n
)
) tb_view AS field
)
29.去掉字符串中括号
SELECT REGEXP_REPLACE('xxxx(yy)zzz', '([(.*?)])', '' )
xxxxyyzzz
30.获取上个月/本月最后一天
2021-07-31 -> LAST_DAY(DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), - 1, 'mm'))
20210731 -> REPLACE(LAST_DAY(DATEADD(TO_DATE('${bizdate}', 'yyyymmdd'), - 1, 'mm')), "-", "")
语法:
datetime lastday(datetime <date>)
string last_day(string <date>)