将日常的分析指标划分为八大分析模型, 根据此分析模型的SQL模板, 只需要修改事件名和字段名, 便可以快速地进行指标统计。
分析模型SQL
事件分析
多个事件或事件属性的统计分析
无分组多个指标查询SQL
-- Event analysis, no grouping: several metrics for one event per time bucket.
with result(day, idx0, idx1, idx2, idx3, idx4, idx5) as (
    select
        cast(e.day as varchar), -- time bucket (day, hour, minute, month, ...)
        -- metric expressions (replace per required metric)
        -- idx0: distinct users
        count(distinct case when e.event = 'ta_mp_show' then e.uid else null end),
        -- idx1: total event count
        count(case when e.event = 'ta_mp_show' then '*' else null end),
        -- idx2: events per user
        divide(count(case when e.event = 'ta_mp_show' then '*' else null end),
               count(distinct case when e.event = 'ta_mp_show' then e.uid else null end)),
        -- idx3: distinct devices (element_at returns NULL for a missing key,
        -- unlike the [] subscript which fails the query)
        count(distinct case when e.event = 'ta_mp_show' then element_at(e.properties, 'device_no') else null end),
        -- idx4: events per device
        divide(count(case when e.event = 'ta_mp_show' then '*' else null end),
               count(distinct case when e.event = 'ta_mp_show' then element_at(e.properties, 'device_no') else null end)),
        -- idx5: avg of a numeric property (swap avg for sum/max/min as needed);
        -- 'else null' keeps rows of other events out of the average
        cast(avg(case when e.event = 'ta_mp_show' then cast(element_at(e.properties, 'scene') as DOUBLE) else null end) as decimal(18, 2))
    from event.action e
    -- time window and event filter (each event needs to appear only once)
    where e.game_id = 9
      and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, date(now() AT Time Zone 'Asia/Shanghai'))
                    and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))
      and e.event in ('ta_mp_show')
    group by e.day
)
-- final ORDER BY: ordering inside a CTE is not guaranteed to survive
select day, idx0, idx1, idx2, idx3, idx4, idx5
from result
order by day desc
有分组多个指标查询SQL
-- Event analysis with one grouping field: metrics per time bucket and group.
with result(day, g0, row_id, idx0, idx1, idx2, idx3, idx4, idx5) as (
    select
        cast(e.day as varchar), -- time bucket (day, hour, minute, month, ...)
        element_at(e.properties, '#mp_platform'), -- grouping field
        -- rank the groups inside each bucket so the result set can be capped
        row_number() over (partition by e.day order by element_at(e.properties, '#mp_platform') desc),
        -- metric expressions (replace per required metric)
        -- idx0: distinct users
        count(distinct case when e.event = 'ta_mp_show' then e.uid else null end),
        -- idx1: total event count
        count(case when e.event = 'ta_mp_show' then '*' else null end),
        -- idx2: events per user
        divide(count(case when e.event = 'ta_mp_show' then '*' else null end),
               count(distinct case when e.event = 'ta_mp_show' then e.uid else null end)),
        -- idx3: distinct devices (element_at returns NULL for a missing key,
        -- unlike the [] subscript which fails the query)
        count(distinct case when e.event = 'ta_mp_show' then element_at(e.properties, 'device_no') else null end),
        -- idx4: events per device
        divide(count(case when e.event = 'ta_mp_show' then '*' else null end),
               count(distinct case when e.event = 'ta_mp_show' then element_at(e.properties, 'device_no') else null end)),
        -- idx5: avg of a numeric property (swap avg for sum/max/min as needed);
        -- 'else null' keeps rows of other events out of the average
        cast(avg(case when e.event = 'ta_mp_show' then cast(element_at(e.properties, 'scene') as DOUBLE) else null end) as decimal(18, 2))
    from event.action e
    -- time window and event filter (each event needs to appear only once)
    where e.game_id = 9
      and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, date(now() AT Time Zone 'Asia/Shanghai'))
                    and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))
      and e.event in ('ta_mp_show')
    group by e.day, element_at(e.properties, '#mp_platform')
)
select
    day, g0, idx0, idx1, idx2, idx3, idx4, idx5,
    -- derived metric rendered as a percentage
    cast(cast(divide(idx0, idx1) as decimal(18, 2)) * 100 as varchar) || '%' as aggIdx0,
    -- derived metric rendered as an integer
    cast(cast(divide(idx0, idx1) as decimal(18, 2)) as bigint) as aggIdx1,
    -- derived metric rendered with 2 decimal places
    cast(divide(idx0, idx1) as decimal(18, 2)) as aggIdx2
from result
-- keep only the first 100 groups per bucket
where row_id <= 100
order by day desc, g0 desc
留存分析
初始事件到目标事件留存或流失的用户分析
无分组留存人数/设备SQL
-- Retention analysis, no grouping: users (or devices) who fired the initial
-- event (e.g. $register) and came back for the target event (e.g. $login).
with number(n) as (
    values (1), (3), (5) -- retention offsets in days: day-1, day-3, day-5
), tmp1(day, uid) as (
    -- cohort: distinct users per day who fired the initial event;
    -- for device retention use element_at(e.properties, 'device_no') instead of uid
    select distinct
        e.day,
        e.uid
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$register'
), tmp2(day, uid, idx) as (
    -- target: distinct users per day who fired the target event
    -- (DISTINCT is implied by the GROUP BY)
    select
        e.day,
        e.uid,
        sum(1) -- target-event count per user (not used downstream)
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$login'
    group by e.day, e.uid
), result(day, n, user_count, idx) as (
    select
        e.day,
        n,
        count(distinct e.uid),  -- cohort size
        count(distinct ee.uid)  -- users retained n days later
    from tmp1 e
    cross join number
    left join tmp2 ee
        on e.uid = ee.uid
       and ee.day = date_add('day', n, e.day)
    where n <= 5 -- upper bound: largest retention offset
    group by e.day, n
)
select
    cast(e.day as varchar),
    max(user_count),
    -- ORDER BY n makes the array position deterministic
    -- (array_agg without ORDER BY has undefined element order)
    array_agg(idx order by n)
from result e
group by e.day
order by e.day desc
有分组留存/设备人数SQL
-- Retention analysis with one grouping field.
with number(n) as (
    values (1), (3), (5) -- retention offsets in days: day-1, day-3, day-5
), tmp1(day, g0, uid) as (
    -- cohort: distinct users per day and group who fired the initial event;
    -- for device retention use element_at(e.properties, 'device_no') instead of uid
    select distinct
        e.day,
        element_at(e.properties, 'media'), -- grouping field
        e.uid
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$register'
), tmp2(day, g0, uid, idx) as (
    -- target: distinct users per day and group who fired the target event
    select
        e.day,
        element_at(e.properties, 'media'),
        e.uid,
        sum(1) -- target-event count per user (not used downstream)
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$login'
    group by e.day, element_at(e.properties, 'media'), e.uid
), result(day, g0, n, user_count, idx) as (
    select
        e.day, e.g0, n,
        count(distinct e.uid),  -- cohort size
        count(distinct ee.uid)  -- users retained n days later
    from tmp1 e
    cross join number
    left join tmp2 ee
        on e.uid = ee.uid
       and ee.day = date_add('day', n, e.day)
       and e.g0 = ee.g0
    where n <= 5 -- upper bound: largest retention offset
    group by e.day, e.g0, n
), order_result(day, g0, row_id, user_count, idxList) as (
    select
        cast(e.day as varchar), e.g0,
        -- rank groups per day so oversized result sets can be capped
        row_number() over (partition by day order by e.g0 desc) row_id,
        max(user_count),
        -- ORDER BY n makes the array position deterministic
        array_agg(idx order by n)
    from result e
    group by e.day, e.g0
)
select day, g0, user_count, idxList
from order_result
where row_id <= 100 -- keep the first 100 groups
order by day desc, g0 desc
无分组流失人数/设备SQL
-- Churn analysis, no grouping: users (or devices) who fired the initial
-- event but did NOT fire the target event n days later.
with number(n) as (
    values (1), (3), (5) -- churn offsets in days: day-1, day-3, day-5
), tmp1(day, uid) as (
    -- cohort: distinct users per day who fired the initial event;
    -- for device churn use element_at(e.properties, 'device_no') instead of uid
    select distinct
        e.day,
        e.uid
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$register'
), tmp2(day, uid, idx) as (
    -- target: distinct users per day who fired the target event
    select
        e.day,
        e.uid,
        sum(1) -- target-event count per user (not used downstream)
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$login'
    group by e.day, e.uid
), result(day, n, user_count, idx) as (
    select
        e.day,
        n,
        count(distinct e.uid), -- cohort size
        -- churned users: cohort members with no matching target event n days later
        count(distinct case when ee.uid is null then e.uid else null end)
    from tmp1 e
    cross join number
    left join tmp2 ee
        on e.uid = ee.uid
       and ee.day = date_add('day', n, e.day)
    where n <= 5 -- upper bound: largest churn offset
    group by e.day, n
)
select
    cast(e.day as varchar),
    max(user_count),
    -- ORDER BY n makes the array position deterministic
    array_agg(idx order by n)
from result e
group by e.day
order by e.day desc
有分组流失人数/设备SQL
-- Churn analysis with one grouping field.
with number(n) as (
    values (1), (3), (5) -- churn offsets in days: day-1, day-3, day-5
), tmp1(day, g0, uid) as (
    -- cohort: distinct users per day and group who fired the initial event;
    -- for device churn use element_at(e.properties, 'device_no') instead of uid
    select distinct
        e.day,
        element_at(e.properties, 'media'), -- grouping field
        e.uid
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$register'
), tmp2(day, g0, uid, idx) as (
    -- target: distinct users per day and group who fired the target event
    select
        e.day,
        element_at(e.properties, 'media'),
        e.uid,
        sum(1) -- target-event count per user (not used downstream)
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$login'
    group by e.day, element_at(e.properties, 'media'), e.uid
), result(day, g0, n, user_count, idx) as (
    select
        e.day, e.g0, n,
        count(distinct e.uid), -- cohort size
        -- churned users: cohort members with no matching target event n days later
        count(distinct case when ee.uid is null then e.uid else null end)
    from tmp1 e
    cross join number
    left join tmp2 ee
        on e.uid = ee.uid
       and ee.day = date_add('day', n, e.day)
       and e.g0 = ee.g0
    where n <= 5 -- upper bound: largest churn offset
    group by e.day, e.g0, n
), order_result(day, g0, row_id, user_count, idxList) as (
    select
        cast(e.day as varchar), e.g0,
        -- rank groups per day so oversized result sets can be capped
        row_number() over (partition by day order by e.g0 desc) row_id,
        max(user_count),
        -- ORDER BY n makes the array position deterministic
        array_agg(idx order by n)
    from result e
    group by e.day, e.g0
)
select day, g0, user_count, idxList
from order_result
where row_id <= 100 -- keep the first 100 groups
order by day desc, g0 desc
LTV分析
初始事件到营收事件的用户收益分析
无分组LTV分析
-- LTV analysis, no grouping: cumulative revenue of a cohort over n days.
with number(n) as (
    values (1), (2), (3) -- LTV horizons in days: day 1 (the cohort day itself), day 2, day 3
), tmp1(day, uid) as (
    -- cohort: distinct users per day who fired the initial event
    select distinct
        e.day, e.uid
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$register'
), tmp2(day, uid, idx) as (
    select
        e.day, e.uid,
        -- revenue per user per day from the revenue field
        sum(cast(element_at(e.properties, '#vp@revenue_amount') as DOUBLE))
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = 'payment'
    group by e.day, e.uid
), result(day, n, user_count, idx) as (
    select
        e.day, n,
        count(distinct e.uid), -- cohort size
        coalesce(sum(idx), 0)  -- revenue accumulated within the first n days
    from tmp1 e
    cross join number
    left join tmp2 ee
        on e.uid = ee.uid
       and ee.day between e.day and date_add('day', n - 1, e.day)
    where n <= 3 -- upper bound: largest LTV horizon
    group by e.day, n
)
select
    cast(e.day as varchar),
    max(user_count),
    -- ORDER BY n makes the array position deterministic
    array_agg(idx order by n)
from result e
group by e.day
order by e.day desc
有分组LTV分析
-- LTV analysis with one grouping field.
with number(n) as (
    values (1), (2), (3) -- LTV horizons in days: day 1 (the cohort day itself), day 2, day 3
), tmp1(day, g0, uid) as (
    -- cohort: distinct users per day and group who fired the initial event
    select distinct
        e.day,
        element_at(e.properties, 'media'), -- grouping field
        e.uid
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = '$register'
), tmp2(day, g0, uid, idx) as (
    select
        e.day, element_at(e.properties, 'media'), e.uid,
        -- revenue per user per day from the revenue field
        sum(cast(element_at(e.properties, '#vp@revenue_amount') as DOUBLE))
    from event.action e
    where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
      and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
      and e.event = 'payment'
    group by e.day, element_at(e.properties, 'media'), e.uid
), result(day, g0, n, user_count, idx) as (
    select
        e.day, e.g0, n,
        count(distinct e.uid), -- cohort size
        coalesce(sum(idx), 0)  -- revenue accumulated within the first n days
    from tmp1 e
    cross join number
    left join tmp2 ee
        on e.uid = ee.uid
       and ee.day between e.day and date_add('day', n - 1, e.day)
       and e.g0 = ee.g0
    where n <= 3 -- upper bound: largest LTV horizon
    group by e.day, e.g0, n
), order_result(day, g0, row_id, user_count, idxList) as (
    select
        cast(e.day as varchar), e.g0,
        -- rank groups per day so oversized result sets can be capped
        row_number() over (partition by day order by e.g0 desc) row_id,
        max(user_count),
        -- ORDER BY n makes the array position deterministic
        array_agg(idx order by n)
    from result e
    group by e.day, e.g0
)
select day, g0, user_count, idxList
from order_result
where row_id <= 100 -- keep the first 100 groups
order by day desc, g0 desc
分布分析
用户事件发生频次或属性数值区间分布分析
无分组数组分布分析SQL
-- Distribution analysis, no grouping: bucket each user's per-day metric into
-- the ranges defined by ARRAY[60,100,200] and count users per bucket.
-- NOTE: the literals 4 below are hard-coded as (array length + 1); they must
-- be kept in sync with the boundary array.
with number(n,array_data) as (
SELECT
CAST(subscript AS INTEGER) subscrip, ARRAY[60,100,200] -- bucket boundaries
FROM (VALUES(sequence(2,4+1,1))) AS t1(array_subscript) -- array length plus one
CROSS JOIN UNNEST(array_subscript) AS t2(subscript)
), tmp1(day, uid, idx) as (
select
e.day,
e.uid,
-- per-user total of the numeric property (missing values count as 0)
sum(COALESCE(cast(element_at(e.properties,'scene') as DOUBLE),0))
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai')))) and e.event = 'ta_mp_show'
group by e.day, e.uid
),tmp2(day, user,min_value,max_value,arr_value) as (
-- per-day summary: user count, min/max of the metric, and every metric value
select
day,
count(distinct uid),
min(idx),
max(idx),
array_agg(idx)
from tmp1
group by day
)
select
-- quoted aliases are the report column headers shown to users
cast(day as varchar) "日期",
max(user) "总人数",
-- human-readable bucket labels, e.g. (-∞,60) [60,100) [100,200) [200,+∞)
array_agg(concat(
case when n-1=1 then '(-∞' else concat('[',cast(array_data[n-2] as varchar)) end,
',',
case when n-1=4 then '+∞)' else concat(cast(array_data[n-1] as varchar),')') end) -- 4 = bucket count (array length + 1)
) gap_list,
-- per-bucket user counts: keep values falling in each half-open range
array_agg(cardinality(filter(
arr_value, x -> IF(n-1=1,min_value-1,array_data[n-2]) <= x and x< IF(n-1<4,array_data[n-1],max_value+1) ))
) value_list
from tmp2
cross join number
where n-1 <= 4 -- 4 = bucket count (array length + 1)
group by day order by (day) desc
有分组分布分析SQL
-- Distribution analysis with one grouping field: same bucketing as the
-- ungrouped version, with the first 100 group values kept per user/day.
-- NOTE: the literals 4 below are hard-coded as (array length + 1); they must
-- be kept in sync with the boundary array.
with number(n,array_data) as (
SELECT
CAST(subscript AS INTEGER) subscrip, ARRAY[60,100,200] -- bucket boundaries
FROM (VALUES(sequence(2,4+1,1))) AS t1(array_subscript) -- array length plus one
CROSS JOIN UNNEST(array_subscript) AS t2(subscript)
), tmp1(day, g0, row_id, uid, idx) as (
select
e.day,
-- grouping field
element_at(e.properties,'#mp_platform'),
-- rank the groups so oversized group cardinality cannot explode the result
row_number() over(partition by (e.uid, e.day) order by (element_at(e.properties,'#mp_platform')) desc),
e.uid,
-- per-user total of the numeric property (missing values count as 0)
sum(COALESCE(cast(element_at(e.properties,'scene') as DOUBLE),0))
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai')))) and e.event = 'ta_mp_show'
group by e.day, element_at(e.properties,'#mp_platform'), e.uid
),tmp2(day, g0, user,min_value,max_value,arr_value) as (
-- per day/group summary: user count, min/max of the metric, and every value
select
day, g0,
count(distinct uid),
min(idx),
max(idx),
array_agg(idx)
from tmp1
where row_id <= 100
group by day, g0
)
select
-- quoted aliases are the report column headers shown to users
cast(day as varchar) "日期", g0,
max(user) "总人数",
-- human-readable bucket labels, e.g. (-∞,60) [60,100) [100,200) [200,+∞)
array_agg(concat(
case when n-1=1 then '(-∞' else concat('[',cast(array_data[n-2] as varchar)) end,
',',
case when n-1=4 then '+∞)' else concat(cast(array_data[n-1] as varchar),')') end) -- 4 = bucket count (array length + 1)
) gap_list,
-- per-bucket user counts: keep values falling in each half-open range
array_agg(cardinality(filter(
arr_value, x -> IF(n-1=1,min_value-1,array_data[n-2]) <= x and x< IF(n-1<4,array_data[n-1],max_value+1) ))
) value_list
from tmp2
cross join number
where n-1 <= 4 -- 4 = bucket count (array length + 1)
group by day, g0 order by (day, g0) desc
漏斗分析
用户在多个事件之间的转换分析
无分组漏斗SQL
-- Funnel analysis, no grouping.
-- tmp0, tmp1, ... tmpN hold the funnel steps in order.
with
tmp0 ( uid, t) as (
select
e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
),tmp1 ( uid, t) as (
select
e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
)
select
count(distinct e0.uid) s0,
count(distinct e1.uid) s1
from tmp0 e0
-- a step-2 event counts toward the funnel only when it happens no later
-- than 1440 minutes (the conversion window) after the step-1 event
left join tmp1 e1 on e1.uid = e0.uid and e1.t between e0.t and (e0.t + interval '1440' minute)
有分组漏斗SQL
-- Funnel analysis with one grouping field.
-- tmp0, tmp1, ... tmpN hold the funnel steps in order.
with
tmp0 (g0, uid, t) as (
select
element_at(e.properties,'#mp_platform'), e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
),
tmp1 (g0, uid, t) as (
select
element_at(e.properties,'#mp_platform'), e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
)
select
e0.g0,
count(distinct e0.uid) s0, count(distinct e1.uid) s1
from tmp0 e0
-- a step-2 event counts toward the funnel only when it happens no later
-- than 1440 minutes (the conversion window) after the step-1 event,
-- and both steps belong to the same group value
left join tmp1 e1 on e1.uid = e0.uid and e1.t between e0.t and (e0.t + interval '1440' minute) and e0.g0 = e1.g0
group by e0.g0 order by (e0.g0) desc limit 100
路径分析
以某个事件为开始或结束在多个事件之间的转换分析
以某事件开头的路径分析
-- Path analysis: event sequences that START with a given event ('用户注册').
with tmp(uid, action, t) as (
-- map raw event codes to display names
select distinct
e.uid,
if(e.event='$register','用户注册', if(e.event='$login','用户登录', if(e.event='payment','内购', '未知' ))),
e.time
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
and e.event in ('$register', '$login', 'payment')
), tmp1(uid, action, t, row_id) as (
-- number each user's events chronologically
select
uid, action, t,
row_number() over(partition by uid order by (t,action))
from tmp
order by (uid, t, action)
), tmp2(uid, action, t, lead_action, lead_t, max_row_id) as (
-- pair each event with the next one; the join against (values(1),(1))
-- duplicates the row with row_id = 1
select
uid, action, t,
lead(action, 1, '默认事件') over(partition by uid order by (t,action)),
lead(t, 1, cast('2999-12-31 23:59:59' as timestamp)) over(partition by uid order by (t,action)),
max(row_id) over(partition by uid) max_row_id
from tmp1
left join (values(1),(1)) t(rid) on row_id = rid
), tmp3(uid,path) as (
select uid,
filter(
reduce(
transform(
-- sort event/next-event pairs by timestamp
zip(
array_sort(zip(array_agg(t),array_agg(action)),
(x, y) -> IF(cast(x[1] as timestamp) < cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1))),
array_sort(zip(array_agg(lead_t),array_agg(lead_action)),
(x, y) -> IF(cast(x[1] as timestamp) < cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1)))),
-- tag each step: '0' = starts a new path (the chosen start event),
-- '1' = extends the current path (within the 1440-minute window,
-- different action, not the sentinel), '-1' = breaks the path
x -> ARRAY[
cast(x[2][1] as varchar), x[2][2],
IF(x[2][2] = '用户注册',
'0',
IF(cast(x[1][1] as timestamp) >= (cast(x[2][1] as timestamp) + interval '-1440' minute) and x[1][2]<>x[2][2] and x[2][2] <> '默认事件',
'1',
'-1')
)
]
),
-- accumulator: a 3-level array of paths, seeded with a sentinel path
array[array[array['2020-10-01 10:00:00.000', '默认事件', '-1']]],
(state, value) -> case
when value[3] = '0' or value[3] = '-1' then
concat(state, array[value])
when value[3] = '1' then
concat(slice(state, 1, cardinality(state) - 1), array[concat(slice(state, -1, 1)[1], array[value])])
end,
-- drop the sentinel seed path
state -> slice(state, 2, cardinality(state) - 1)
),
-- keep only paths that start with the chosen event and have more than one step
x -> x[1][3] = '0' and cardinality(x) > 1
) path
from tmp2
where max_row_id>1
group by uid
), result as(
-- flatten each path array into a '->'-joined display string
SELECT
uid, replace(ltrim(regexp_replace(str,'([0-9]*)(\p{Punct}*)','')),' ','->') path
FROM tmp3
CROSS JOIN UNNEST(SPLIT(json_format(cast(path as json)) ,']],[[')) t(str)
)
select
path,
count(*) over() all_path_count,
count(*) path_count
from result
where path<>'用户注册' and path<>''
group by path
order by (path_count, path) desc
limit 100
以某事件结尾的路径分析
-- Path analysis: event sequences that END with a given event ('内购').
-- Action names are reverse()d so the path can be built backwards with the
-- same machinery as the "starts with" query, then reversed again at the end.
with tmp(uid, action, t) as (
-- map raw event codes to reversed display names
select distinct
e.uid,
reverse(if(e.event='$register','用户注册', if(e.event='$login','用户登录', if(e.event='payment','内购', '未知' )))),
e.time
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
and e.event in ('$register', '$login', 'payment')
),tmp1(uid, action, t, row_id) as (
-- number each user's events in reverse chronological order
select
uid, action, t,
row_number() over(partition by uid order by (t,action) desc)
from tmp
),tmp2(uid, action, t, lead_action, lead_t, max_row_id) as (
-- pair each event with the previous one (descending order); the join
-- against (values(1),(1)) duplicates the row with row_id = 1
select
uid, action, t,
lead(action, 1, '默认事件') over(partition by uid order by (t,action) desc),
lead(t, 1, cast('1970-01-01 00:00:00' as timestamp)) over(partition by uid order by (t,action) desc),
max(row_id) over(partition by uid) max_row_id
from tmp1
left join (values(1),(1)) t(rid) on row_id = rid
),tmp3(uid,path) as (
select uid,
filter(
reduce(
transform(
-- sort event/next-event pairs by timestamp, descending
zip(
array_sort(zip(array_agg(t),array_agg(action)),
(x, y) -> IF(cast(x[1] as timestamp) > cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1))),
array_sort(zip(array_agg(lead_t),array_agg(lead_action)),
(x, y) -> IF(cast(x[1] as timestamp) > cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1)))),
-- tag each step: '0' = the chosen end event (opens a reversed path),
-- '1' = extends the path (within the 1440-minute window), '-1' = breaks it
x -> ARRAY[
cast(x[2][1] as varchar), x[2][2],
IF(x[2][2] = reverse('内购'),
'0',
IF(cast(x[1][1] as timestamp) <= (cast(x[2][1] as timestamp) + interval '1440' minute) and x[1][2]<>x[2][2] and x[2][2] <> '默认事件',
'1',
'-1')
)
]
),
-- accumulator: a 3-level array of paths, seeded with a sentinel path
array[array[array['2020-10-01 10:00:00.000', '默认事件', '-1']]],
(state, value) -> case
when value[3] = '0' or value[3] = '-1' then
concat(state, array[value])
when value[3] = '1' then
concat(slice(state, 1, cardinality(state) - 1), array[concat(slice(state, -1, 1)[1], array[value])])
end,
-- drop the sentinel seed path
state -> slice(state, 2, cardinality(state) - 1)
),
-- keep only paths that end with the chosen event and have more than one step
x -> x[1][3] = '0' and cardinality(x) > 1
) path
from tmp2
where max_row_id>1
group by uid
), result as(
-- flatten into a '>-'-joined string; after the final reverse() the
-- separator reads as '->'
SELECT
uid,replace(ltrim(regexp_replace(str,'([0-9]*)(\p{Punct}*)','')),' ','>-') path
FROM tmp3
CROSS JOIN UNNEST(SPLIT(json_format(cast(path as json)) ,']],[[')) t(str)
)
select
-- undo the character reversal for display
reverse(path) path,
count(*) over() all_path_count,
count(*) path_count
from result
where path<>reverse('内购') and path<>''
group by path
order by (path_count, path) desc
limit 100
事件明细分析
查看和多维度筛选事件日志
-- Event detail query: raw event log with multi-dimension filters.
-- The quoted aliases are report column headers shown to users.
select
e.uid "账号ID",
e.uid_type "账号类型",
element_at(e.properties,'#mp_platform') "小程序平台",
element_at(e.properties,'#os') "操作系统",
cast(element_at(e.properties,'#zone_offset') as DOUBLE) "时区偏移",
cast(element_at(e.properties,'life_time') as INTEGER) "生命周期天数",
cast(element_at(e.properties,'scene') as DOUBLE) "场景值",
format_datetime(e.time, 'yyyy-MM-dd HH:mm:ss') "时间",
e.timezone "时区",
format_datetime(e.timestamp, 'yyyy-MM-dd HH:mm:ss') "采集时间"
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
-- scan the last 2 days up to the previous minute
and e.day >= date_add('day',-2, date(now() AT Time Zone 'Asia/Shanghai')) and e.minute <= format_datetime(date_add('minute', -1, now() AT Time Zone 'Asia/Shanghai'),'yyyy-MM-dd HH:mm') and e.event = 'ta_mp_show'
and (element_at(e.properties,'#mp_platform') = '平台1') -- dimension filter
order by (e.time) desc
limit 100
SQL分析
支持对全量数据的SQL查询和可视化,在通用的分析模型不支持的时候, 可以使用SQL制作复杂报表
UDF函数
标量函数
除法函数
除0特殊处理, 除法保留2位小数, 支持不同数值类型相除
package scalarFunctions;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import static com.facebook.presto.common.type.StandardTypes.DOUBLE;
import static com.facebook.presto.common.type.StandardTypes.INTEGER;
/**
 * Presto scalar UDF "divide": division that treats a zero denominator as 0
 * and rounds the quotient to two decimal places. The four overloads cover
 * every combination of SQL INTEGER and DOUBLE operands; Presto dispatches
 * on the argument types.
 */
public class avgPrestoFunction {

    // Round to two decimal places (HALF_UP, matching "%.2f").
    // Locale.ROOT forces '.' as the decimal separator; formatting with the
    // default locale can emit ',' (e.g. in German locales) and make
    // Double.parseDouble throw NumberFormatException.
    private static double round2(double value) {
        return Double.parseDouble(String.format(java.util.Locale.ROOT, "%.2f", value));
    }

    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(INTEGER) long numerator, // dividend
                             @SqlType(INTEGER) long denominator) {// divisor
        if (denominator == 0) {
            return 0.0; // divide-by-zero yields 0 by contract
        }
        return round2((double) numerator / denominator);
    }

    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(INTEGER) long numerator, // dividend
                             @SqlType(DOUBLE) double denominator) {// divisor
        if (denominator == 0) {
            return 0.0; // divide-by-zero yields 0 by contract
        }
        return round2(numerator / denominator);
    }

    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(DOUBLE) double numerator, // dividend
                             @SqlType(INTEGER) long denominator) {// divisor
        if (denominator == 0) {
            return 0.0; // divide-by-zero yields 0 by contract
        }
        return round2(numerator / denominator);
    }

    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(DOUBLE) double numerator, // dividend
                             @SqlType(DOUBLE) double denominator) {// divisor
        if (denominator == 0) {
            return 0.0; // divide-by-zero yields 0 by contract
        }
        return round2(numerator / denominator);
    }
}
分段函数
将当前数据划分到某个区间段, 用于分布分析模型当中
package scalarFunctions;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Presto scalar UDF "toIntervalUDF": maps a value onto one of the half-open
 * buckets (-∞,gap), [gap,2*gap), ..., [n*gap,+∞) and returns the bucket label.
 */
public class toIntervalFunction {
    @ScalarFunction("toIntervalUDF")//function name
    @Description("Number and spacing of specified segments, int to Interval function")//function description
    @SqlType(StandardTypes.VARCHAR)//return type
    public static Slice toInterval(@SqlType(StandardTypes.BIGINT) long input, // value to bucket
                                   @SqlType(StandardTypes.BIGINT) long n, // number of segments
                                   @SqlType(StandardTypes.BIGINT) long gap) { // segment width
        String interval;
        // Use input < gap (not <=) so that input == gap falls into
        // [gap, 2*gap); this keeps the buckets half-open and consistent
        // with specialIntervalFunction.output, which uses value < gap.
        if (input < gap) {
            interval = "(-∞," + gap + ")";
        } else {
            // long arithmetic avoids int truncation for large inputs
            long i = input / gap;
            if (i == 0) {
                interval = "(-∞," + gap + ")";
            } else if (i < n) {
                interval = "[" + i * gap + "," + (i + 1) * gap + ")";
            } else {
                // everything at or above the last boundary collapses into the open-ended bucket
                interval = "[" + n * gap + "," + "+∞)";
            }
        }
        return Slices.utf8Slice(interval);
    }
}
聚合函数
分段聚合函数
统计出该列的最小值,最大值,按照指定段数进行分段
package aggregationFunctions;
import com.facebook.presto.common.block.BlockBuilder;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.common.type.VarcharType;
import com.facebook.presto.spi.function.*;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import static com.facebook.presto.common.type.StandardTypes.VARCHAR;
@AggregationFunction("toIntervalUDAF")//function name
@Description("参数:(聚合操作名,聚合值,段数,区间间隔) 指定聚合操作进行聚合,将聚合结果转为指定区间范围")
public class specialIntervalFunction {
    /**
     * Accumulator state shared by the input/combine/output phases.
     * Presto generates the implementation from this interface.
     */
    public interface specialState extends AccumulatorState {
        // running maximum
        double getMax();
        void setMax(double value);
        // running minimum
        double getMin();
        void setMin(double value);
        // running sum (needed by both "sum" and "avg")
        double getSum();
        void setSum(double sum);
        // number of segments
        long getNum();
        void setNum(long num);
        // segment width
        long getGap();
        void setGap(long gap);
        // number of accumulated values
        long getCount();
        void setCount(long value);
        // aggregate operation name: sum, max, min or avg
        Slice getOperation();
        void setOperation(Slice operation);
    }

    /**
     * Shared accumulation logic for all {@code @InputFunction} overloads.
     * The first value initializes the state; later values update only the
     * statistic the requested operation needs.
     */
    private static void accumulate(specialState state, Slice operation,
                                   double value, long n, long gap) {
        if (state.getCount() <= 0) {
            // first value: initialize every state field
            state.setOperation(operation);
            state.setNum(n);
            state.setMax(value);
            state.setMin(value);
            state.setGap(gap);
            state.setSum(state.getSum() + value);
        } else {
            String op = state.getOperation().toStringUtf8();
            // equalsIgnoreCase avoids locale-sensitive toLowerCase()
            // (e.g. the Turkish dotless-i would break "min")
            if ("max".equalsIgnoreCase(op)) {
                if (state.getMax() < value) state.setMax(value);
            } else if ("min".equalsIgnoreCase(op)) {
                if (state.getMin() > value) state.setMin(value);
            } else {
                // sum and avg both need the running total
                state.setSum(state.getSum() + value);
            }
        }
        state.setCount(state.getCount() + 1);
    }

    @InputFunction
    public static void input(specialState state,
                             @SqlType(VARCHAR) Slice operation, // operation name: sum, max, min, avg
                             @SqlType(StandardTypes.BIGINT) long input, // value to aggregate
                             @SqlType(StandardTypes.BIGINT) long n, // number of segments
                             @SqlType(StandardTypes.BIGINT) long gap) { // segment width
        accumulate(state, operation, (double) input, n, gap);
    }

    @InputFunction
    public static void input2(specialState state,
                              @SqlType(VARCHAR) Slice operation, // operation name: sum, max, min, avg
                              @SqlType(StandardTypes.INTEGER) long input, // value to aggregate
                              @SqlType(StandardTypes.BIGINT) long n, // number of segments
                              @SqlType(StandardTypes.BIGINT) long gap) { // segment width
        accumulate(state, operation, (double) input, n, gap);
    }

    @InputFunction
    public static void input(specialState state,
                             @SqlType(VARCHAR) Slice operation, // operation name: sum, max, min, avg
                             @SqlType(StandardTypes.DOUBLE) double input, // value to aggregate
                             @SqlType(StandardTypes.BIGINT) long n, // number of segments
                             @SqlType(StandardTypes.BIGINT) long gap) { // segment width
        accumulate(state, operation, input, n, gap);
    }

    @CombineFunction
    public static void combine(specialState state, specialState otherState) {
        if (state.getCount() != 0) {
            // both partial states are initialized: merge the statistics;
            // num/gap/operation are identical across partials, keep state's
            state.setCount(state.getCount() + otherState.getCount());
            state.setMax(Math.max(state.getMax(), otherState.getMax()));
            state.setMin(Math.min(state.getMin(), otherState.getMin()));
            state.setSum(state.getSum() + otherState.getSum());
        } else {
            // this side saw no input: adopt the other state wholesale
            state.setCount(otherState.getCount());
            state.setMax(otherState.getMax());
            state.setMin(otherState.getMin());
            state.setSum(otherState.getSum());
            state.setNum(otherState.getNum());
            state.setGap(otherState.getGap());
            state.setOperation(otherState.getOperation());
        }
    }

    @OutputFunction(StandardTypes.VARCHAR)
    public static void output(specialState state, BlockBuilder out) {
        long count = state.getCount();
        // no input rows: emit SQL NULL
        if (count <= 0) {
            out.appendNull();
            return;
        }
        String operation = state.getOperation().toStringUtf8();
        long n = state.getNum();
        long gap = state.getGap();
        // resolve the aggregated value according to the requested operation
        double value;
        if ("avg".equalsIgnoreCase(operation)) {
            value = state.getSum() / count;
        } else if ("max".equalsIgnoreCase(operation)) {
            value = state.getMax();
        } else if ("min".equalsIgnoreCase(operation)) {
            value = state.getMin();
        } else {
            // any other operation name falls back to sum
            value = state.getSum();
        }
        // map the value onto a half-open bucket:
        // (-∞,gap), [gap,2*gap), ..., [n*gap,+∞)
        String interval;
        if (value < gap) {
            interval = "(-∞," + gap + ")";
        } else {
            int i = (int) (value / gap);
            if (i == 0) {
                interval = "(-∞," + gap + ")";
            } else if (i < n) {
                interval = "[" + i * gap + "," + (i + 1) * gap + ")";
            } else {
                interval = "[" + n * gap + "," + "+∞)";
            }
        }
        VarcharType.VARCHAR.writeSlice(out, Slices.utf8Slice(interval));
    }
}
Presto自定义函数部署
将jar放在presto插件位置/usr/lib/presto/plugin, 重启presto即可
系列文章
第一篇: Ambari自动化部署
第二篇: 数据埋点设计和SDK源码
第三篇: 数据采集和验证方案
第四篇: ETL实时方案: Kafka->Flink->Hive
第五篇: ETL用户数据处理: kafka->spark->kudu
第六篇: Presto分析模型SQL和UDF函数
第七篇: 用户画像和留存预测模型