日志数据 test.action_dtl_ld
1、步骤定义
第一步: eventid = 'e1' and properties['p1'] = 'v1'
第二步: eventid = 'e4' and properties['p5'] = 'vx'
第三步: eventid = 'e6' and properties['p2'] = 'v2'
2、步骤号计算
select
guid,
eventid,
properties ,
ts,
case
-- 前边添加一个ld 是为了正则表达式好抽取
when eventid = 'e1'
and properties['p1'] = 'v1' then 'ld1'
when eventid = 'e4'
and properties['p5'] = 'vx' then 'ld2'
when eventid = 'e6'
and properties['p2'] = 'v2' then 'ld3'
else null
end as orderid
from
test.action_dtl_ld
3、步骤号计算结果
4、步骤正则编码计算
1、根据步骤号计算结果,过滤掉orderid是null的
2、concat(ts,'_',orderid)将ts与orderid拼接起来(拼接ts放在前边用来排序,sort_array)
3、collect_list(_),将ts与orderid拼接起来的结果收集到list中
4、sort_array(_),将list中的数据进行排序,因为前边拼接了ts所以会根据时间先后排序
5、concat_ws(_), 将list中排好序的数据,转成字符串,as Regstr
concat_ws(',',sort_array(collect_list(concat(ts,'_',orderid)))) as regstr
with t_orderid as (
select
guid,
eventid,
properties ,
ts,
case
when eventid = 'e1'
and properties['p1'] = 'v1' then 'ld1'
when eventid = 'e4'
and properties['p5'] = 'vx' then 'ld2'
when eventid = 'e6'
and properties['p2'] = 'v2' then 'ld3'
else null
end as orderid
from
test.action_dtl_ld
)
select
guid,
concat_ws(',',sort_array(collect_list(concat(ts,'_',orderid)))) as regstr
from
t_orderid
where
orderid is not null
group by guid
5、步骤正则编码计算结果
6、利用正则表达式判断上述结果
regexp_extract(subject,parttern,inedx)
解释:
--subject 是被解析的字符串
--pattern 是正则表达式
--index 是返回结果,取表达式的哪一部,“默认值为1”。
----0表示把整个正则表达式对应的结果全部返回;
----1表示返回正则表达式中第一个() 对应的结果 以此类推。
----注意点:
----要注意的是idx的数字不能大于表达式中()的个数。
----否则报错
select REGEXP_EXTRACT('3_ld1,4_ld1,5_ld2,7_ld3','.*?(ld1).*?(ld2).*?(ld3).*?',3)
----ld3
7、 计算语句
7、1 计算每个人完成的最大漏斗步数
with t_orderid as (
select
guid,
eventid,
properties ,
ts,
case
when eventid = 'e1'
and properties['p1'] = 'v1' then 'ld1'
when eventid = 'e4'
and properties['p5'] = 'vx' then 'ld2'
when eventid = 'e6'
and properties['p2'] = 'v2' then 'ld3'
else null
end as orderid
from
test.action_dtl_ld
),
regstr_res as (
select
guid,
concat_ws(',',
sort_array(collect_list(concat(ts, '_', orderid)))) as regstr
from
t_orderid
where
orderid is not null
group by
guid
)
-- 计算每个人完成的最大漏斗步数
select
guid,
case
when REGEXP_EXTRACT(regstr,'.*?(ld1).*?(ld2).*?(ld3).*?',3) = 'ld3' then 3
when REGEXP_EXTRACT(regstr,'.*?(ld1).*?(ld2).*?',2) = 'ld2' then 2
when REGEXP_EXTRACT(regstr,'.*?(ld1).*?',1) = 'ld2' then 1
else 0
end as stepmax
from
regstr_res
结果1
7、2 计算每个人完成的最大漏斗步数
with t_orderid as (
select
guid,
eventid,
properties ,
ts,
case
when eventid = 'e1'
and properties['p1'] = 'v1' then 'ld1'
when eventid = 'e4'
and properties['p5'] = 'vx' then 'ld2'
when eventid = 'e6'
and properties['p2'] = 'v2' then 'ld3'
else null
end as orderid
from
test.action_dtl_ld
),
regstr_res as (
select
guid,
concat_ws(',',
sort_array(collect_list(concat(ts, '_', orderid)))) as regstr
from
t_orderid
where
orderid is not null
group by
guid
),
step_nums as (
select
guid,
case
when REGEXP_EXTRACT(regstr, '.*?(ld1).*?(ld2).*?(ld3).*?', 3) = 'ld3' then 3
when REGEXP_EXTRACT(regstr, '.*?(ld1).*?(ld2).*?', 2) = 'ld2' then 2
when REGEXP_EXTRACT(regstr, '.*?(ld1).*?', 1) = 'ld2' then 1
else 0
end as stepmax
from
regstr_res
)
-- 计算各个步数的人数
select
count(if(stepmax>0,1,null)) as step1,
count(if(stepmax>1,1,null)) as step2,
count(if(stepmax>2,1,null)) as step3
from
step_nums
结果2
8、结论
1、用户g01完成2步、用户g02完成3步
2、步骤1有2人完成,步骤2有两人完成,步骤3有两人完成