1、现有交易数据表user_goods_table,
老板想知道每个用户购买的外卖品类偏好分布,并找出每个用户购买最多的外卖品类是哪个。
方法一:
SELECT
user_name
,goods_kind
FROM
(
SELECT
user_name
,goods_kind
,cn
,rank() over( partition by user_name order by cn desc) rn
FROM
(
SELECT
user_name
,goods_kind
,count(*) as cn
FROM user_goods_table
GROUP BY user_name,goods_kind
)a
)b
WHERE b.rn=1
-- 排序可以直接在第一个SELECT里进行,这样就只需要一个子查询了
方法二:
SELECT
user_name
,goods_kind
FROM
(
SELECT
user_name
,goods_kind
,count(*) as cn
,rank() over( partition by user_name order by count(*) desc) rn
FROM user_goods_table
GROUP BY user_name,goods_kind
)a
WHERE a.rn=1
2、现有交易数据表user_sales_table,
老板想知道支付金额在前20%的用户。
SELECT
user_name
,pay_amount
,nt
FROM
(
SELECT
user_name
,pay_amount
,ntile(5) over(ORDER BY Cast(pay_amount as SIGNED INT) DESC) nt
FROM user_sales_table
)a
WHERE nt=1
3、现有用户登录表user_login_table,
老板想知道连续7天都登录平台的重要用户。
方法一(先排序,然后相减并进行分组计数):
SELECT
user_name
,DATE_SUB(cast(date as date),interval rn day) ds
,count(DATE_SUB(cast(date as date),interval rn day)) ct
FROM
(
SELECT
user_name
,date
,row_number() over(partition by user_name order by cast(date as date) asc) rn
FROM user_login_table
)a
GROUP BY user_name,ds
HAVING count(DATE_SUB(cast(date as date),interval rn day))>=7
方法二(使用lead函数进行偏移计算,然后使用date_sub或date_add进行日期加减作为筛选条件):
SELECT
DISTINCT user_name
FROM
(
SELECT
user_name
,date
,LEAD(date,6) OVER(partition by user_name order by cast(date as date) asc) lead_date
FROM user_login_table
)a
WHERE date_add(cast(date as date),interval 6 day)=cast(lead_date as date)
4、给定一张用户签到表user_attendence,表中包含三个字段,分别是用户ID:【user_id】,日期:【date】,是否签到:【is_sign_in】,0否1是。
4-1、计算截至当前(假设当前时间为2020-04-27),每个用户已经连续签到的天数:
要求输出用户ID【user_id】和连续签到天数【recent_continuous_days】
先计算最近一次没有签到的日期,然后用最近的日期减去最近一次没有签到的日期,就可以得到最近连续签到的天数
SELECT
userid
,MAX(date)
,datediff("2020-04-27",MAX(date)) recent_continuous_days
FROM user_attendence
WHERE is_sign=0
GROUP BY userid
4-2、计算有史以来用户最大连续签到天数:
要求输出用户ID【user_id】和最大连续签到天数
方法一(先进行排序,然后使用日期与排序相减并对相减之后的日期进行分组排序计数,就可以得出连续登录的次数,然后再进行排序取最大值):
SELECT
userid
,ct
FROM
(
SELECT
userid
,DATE_SUB(date,INTERVAL rn day) ds
,COUNT(DATE_SUB(date,INTERVAL rn day)) ct -- count括号里可以直接用*代替
,rank() over(PARTITION by userid ORDER BY COUNT(DATE_SUB(date,INTERVAL rn day)) DESC) rn
-- 这个可以不用窗口函数,直接在外面使用MAX(COUNT(*))就可以取得最大值
FROM
(
SELECT
userid
,date
,row_number() over(partition by userid order by date asc ) rn
FROM user_attendence
WHERE is_sign=1
)a
GROUP BY userid,ds
)b
WHERE rn=1
方法二((先进行排序,然后使用日期与排序相减并对相减之后的日期进行分组排序计数,就可以得出连续登录的次数,然后直接用MAX函数取最大值):
SELECT b.userid
, MAX(b.continues_day) as max_continuous_days
FROM
(SELECT
a.userid
, a.date-a.rn AS difference
, COUNT(*) AS continues_day
FROM
(SELECT
userid,
date,
ROW_NUMBER() OVER (PARTITION BY userid ORDER BY date) AS rn
FROM user_attendence
WHERE is_sign = 1) AS a
GROUP BY a.userid, difference
) AS b
GROUP BY b.userid