数据源
hive中的adventure_ods库的ods_sales_orders表
use adventure_ods;
select * from ods_sales_orders limit 5;
一:每个用户截止到每月为止的最大交易金额和累计到该月的总交易金额
select customer_key,substr(create_date,1,7) as umonth,count(1) as ucount,sum(unit_price) as usum
from ods_sales_orders
group by customer_key,substr(create_date,1,7)
limit 10;
select *,max(usum) over(partition by customer_key order by umonth) as current_max,
sum(usum) over(partition by customer_key order by umonth) as current_sum
from (select customer_key,substr(create_date,1,7) as umonth,count(1) as ucount,sum(unit_price) as usum
from ods_sales_orders
group by customer_key,substr(create_date,1,7)) as a
limit 10;
二:计算月份的回购率和复购率
复购率&回购率:
select a.umonth,concat(round(count(b.umonth)*100/count(1),2),'%') as `回购率(%)`,concat(round(sum(if(a.order_num>1,1,0))*100/count(1),2),'%') as `复购率(%)` from
(select customer_key,substr(create_date,1,7) as umonth,count(1) as order_num
from ods_sales_orders
group by customer_key,substr(create_date,1,7)) as a
left join
(select customer_key,substr(create_date,1,7) as umonth
from ods_sales_orders
group by customer_key,substr(create_date,1,7)) as b
on a.customer_key=b.customer_key and substring(a.umonth,6,2)=substring(b.umonth,6,2)-1
group by a.umonth;
三:求用户号对应不同的产品
with tmp as (
select * from (
select customer_key,cpzl_zw,row_number() over(partition by customer_key order by create_date asc) as `排序1`,lag(cpzl_zw,1,null) over(partition by customer_key order by create_date asc) as cpzl_zw1
from ods_sales_orders) as a
where cpzl_zw!=cpzl_zw1
),
tmp2 as (
select *,row_number() over(partition by customer_key order by `排序1` ) as `排序2`
from tmp
)
select concat(customer_key,'-',concat_ws('-',collect_set(cpzl_zw)))
from tmp2
where `排序2`<3
group by customer_key;
-- collect_set()必须要用group by
四:统计各个省份所属城市下最受欢迎的Top 3 产品和其销量(不能出现null)
select * from
(select chinese_city,english_product_name,count(1) as sales,row_number() over(partition by chinese_city order by count(1) desc) as `排序`
from ods_sales_orders as a
inner join
(select customer_key,chinese_city
from ods_customer where chinese_city!='null') as b
on a.customer_key=b.customer_key
group by chinese_city,english_product_name) as c
where `排序`<=3;
五:商品的销售数量top10,排名需考虑并列排名的情况
-- dense_rank()连续排序(1,1,2)
select * from
(select english_product_name,count(1) as sales,dense_rank() over(order by count(1) desc) as `排序`
from ods_sales_orders
group by english_product_name) as a
where a.`排序`<=10;
六:计算累加和(统计2019年1-12月的累积销量,即1月为1月份的值,2月为1、2月份值的和,3月为1、2、3月份的和,12月为1-12月份值的和)
select substr(create_date,1,7) as umonth,count(1) as sales,sum(count(1)) over(order by substr(create_date,1,7)) as `累积销量` from
ods_sales_orders
where year(create_date)='2019'
group by substr(create_date,1,7);
七:计算客户平均购买一次商品的间隔时间
select customer_key,avg(time_interval) as avg_buy_period from
(select customer_key,datediff(lead(create_date,1,'2019-01-01') over(partition by customer_key order by create_date),create_date) as time_interval
from ods_sales_orders) as a
where time_interval>0
group by customer_key;
注解:
和lag()over()相反。
lead()over():
LEAD(col,n,DEFAULT) 用于统计窗口内往下第n行值,第一个参数为列名,第二个参数为往下第n行(可选,默认为1),第三个参数为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)所以使用LEAD (时间,1),取客户的时间,按照客户编号分组再按时间排序,所以使用over(partition by 客户编号 order by 时间)
八:查询最近前20%时间的订单信息
select * from
(select cplb_zw,cpzl_zw,unit_price,create_date,ntile(5) over(order by create_date desc) as stored
from ods_sales_orders) as a
where stored=1;