3、去过xx的人也喜欢yy
计算口径:所有账户(username,过滤大单用户账号)取最近5个订单,求某个账号任意2个航线的共现次数,除以2个航线单独出现次数的乘积,得到结果作为相关性(百分比)
如
用户A: a-b a-c
用户B :a-b
用户C: a-c
则用户A,a-b与a-c相关性=1/2*2=25%
Key:username
Value:两个相关航线-相关性百分比
如:a-b&a-c&25%
drop table if exists tmp_data;
create table tmp_data as
select
username,
air_line
from(
select
username,
air_line,
create_time,
row_number() over(partition by username order by create_time) ranks
from(
select
qunar_username,
concat(act_dep_city,'-',act_arr_city) air_line,
max(create_time) as create_time
from f_wide.wide_order
where dt>='$QDATE(-730,'yyyyMMdd')' and pay_ok=1 and dom_inter=0
group by username,concat(act_dep_city,'-',act_arr_city)
)A
)A where ranks<=5;
insert overwrite table user_air_line_similar partition(dt)
select
username,
concat_ws(',',udf.collect_list_sort(concat(B.air_line,'&',couple_air_line,'&',rate),rn)),
'$QDATE(0,'yyyyMMdd')' dt
from tmp_data A
join(
select
air_line,
couple_air_line,
round(couple_num*1.0/(a_airline_num*b_airline_num),4) as rate,
row_number() over(partition by air_line order by couple_num*1.0/(a_airline_num*b_airline_num) desc ) as rn --取出所有包含A航线的,的组合方式,并排名
from(
select
A.air_line as air_line,
b_air_line as couple_air_line,
a_airline_num,
b_airline_num,
count(distinct username) as couple_num--针对每一种航线组合,对用户名去重后,取总的组合次数
from(
select
A.username,
A.air_line air_line,
B.air_line b_air_line,
count(distinct A.username) over (partition by A.air_line) as a_airline_num,--针对每一个航线对用户名去重,然后取出每一个航线总的售卖量
count(distinct A.username) over (partition by B.air_line) as b_airline_num
from tmp_data A
join tmp_data B on (A.username=B.username)
where split(A.air_line,'-')[0] = split(B.air_line,'-')[0] and split(A.air_line,'-')[1] <> split(B.air_line,'-')[1]
)A
group by
A.air_line,
b_air_line,
a_airline_num,
b_airline_num
)A
)B on (A.air_line=B.air_line and B.rn<=20)--A航线的组合方式以及排名已经出来只取前二十,用A表中的航线跟B中的航线组合中的一个航线进行join,就得到A表中的航线跟其他航线的组合的概率了。
group by A.username
distribute by dt;