相关精彩专题链接: 数据成就更好的你
一、项目需求
本案例的数据为小程序运营数据,以行业常见指标对用户行为进行分析,包括UV、PV、新增用户分析、留存分析、复购分析等内容。
项目需求如下:
1.日访问量分析,并观察其走势
2.不同行为类型的访问量分析
3.一天中不同时间段的访问量分析(时间段按小时划分)
4.每日新增用户情况分析
5.用户留存分析
6.复购分析
7.商品排行榜分析
8.利用sqoop将数据分析结果导入mysql存储
二、数据介绍
1.用户行为信息表
2.查看具体的数据格式
a.用户信息:head -n 3 behavior.txt
b.去除首行,首行为标题行,hive导入数据时不需要此行:
sed -i "1d" behavior.txt
三、创建表
创建用户行为表(需结合数据格式)
#创建用户行为表
create table if not exists behavior(
user_id int comment "用户id",
goods_id int comment "商品id",
cat int comment "商品类别id",
behavior string comment "用户行为",
time1 Timestamp comment "访问时间",
addr string comment "地域",
price double comment "单价",
amount double comment "数量")
row format delimited
fields terminated by "\t"
lines terminated by "\n";
#向表中导入数据
load data local inpath "/home/software/hive/data/behavior.txt" into table behavior;
四、用户行为分析:pv/uv
1.日访问量分析,并观察其走势
select date(time1) date1,count(1) pv, count(distinct(user_id)) uv from behavior group by date(time1);
2.不同行为类型的访问量分析
select
date(time1) date1,
sum(if(behavior="buy",1,0)) buy_pv, count(distinct(if(behavior="buy",user_id,null))) buy_uv,
sum(if(behavior="cart",1,0)) cart_pv, count(distinct(if(behavior="cart",user_id,null))) cart_uv,
sum(if(behavior="fav",1,0)) fav_pv, count(distinct(if(behavior="fav",user_id,null))) fav_uv,
sum(if(behavior="pv",1,0)) view_pv, count(distinct(if(behavior="pv",user_id,null))) view_uv
from behavior group by date(time1);
3.一天中不同时间段的访问量分析(时间段按小时划分)
select date(time1) date1,hour(time1) hour1,count(1) pv,count(distinct(user_id)) uv from behavior group by date(time1),hour(time1);
五、获客分析
获客分析:观察每日新增用户情况。新用户的定义:第一次访问网站
select a.date1,count(1) new_visitor
from (select user_id,min(date(time1)) date1 from behavior group by user_id)a
group by a.date1;
六、用户留存分析
留存定义:
1月1日,新增用户200人;
次日留存:第2天,1月2日,这200人里面有100人活跃,则次日留存率为:100 / 200 = 50%
2日留存:第3天,1月3日,这200名新增用户里面有80人活跃, 第3日新增留存率为:80/200 = 40%; 以此类推
#计算次日留存和3日留存
select
a.date1,
count(distinct(a.user_id)) new_user_num,
count(distinct(if(abs(datediff(a.date1,date(b.time1)))=1,b.user_id,null))) retention_one_num,
concat(round(count(distinct(if(abs(datediff(a.date1,date(b.time1)))=1,b.user_id,null)))*100/count(distinct(a.user_id)),2),"%") retention_one_rate,
count(distinct(if(abs(datediff(a.date1,date(b.time1)))=3,b.user_id,null))) retention_three_num,
concat(round(count(distinct(if(abs(datediff(a.date1,date(b.time1)))=3,b.user_id,null)))*100/count(distinct(a.user_id)),2),"%") retention_three_rate
from
(select user_id,min(date(time1)) date1 from behavior group by user_id)a
left join behavior b
on a.user_id=b.user_id
group by a.date1
留存分析结果如下:
例:2019-11-28日的新增7610个用户,次日这些新增用户有6026个再次访问网页,留存率为79.19%,第4天,有5980个用户再次访问,留存率为78.58%
七、复购分析
指在单位时间段内,重复购买率=再次购买人数/总购买人数。
例如在一个月内,有100个客户成交,其中有20个是回头客,则重复购买率为20%。
此处的回头客定义为:按天去重,即一个客户一天产生多笔交易付款,则算一次购买,除非在统计周期内另外一天也有购买的客户才是回头客。
1.用户的购买次数统计
create table user_buy as
select m.user_id as user_id,count(1) as num
from
(select user_id,date(time1) date1 from behavior where behavior="buy" group by user_id,date(time1))m
group by m.user_id
2.复购率计算
select concat(round(count(if(num>1,user_id,null))*100/count(1),2),"%") rebuy_rate from user_buy
#本案列的复购率为42.06%
八、商品排行榜信息
1.商品的销售数量top10,排名需考虑并列排名的情况
select * from
(select a.goods_id as goods_id,a.sale_amount as sale_amount,dense_rank() over(order by a.sale_amount desc ) as rank1
from
(select goods_id,sum(amount) sale_amount from behavior where behavior="buy" group by goods_id)a
)b
where b.rank1<=10;
2.商品的浏览次数top10,排名需考虑并列排名的情况
select * from
(select a.goods_id as goods_id,a.pv as pv,dense_rank() over(order by a.pv desc ) as rank1
from
(select goods_id,count(1) pv from behavior where behavior="pv" group by goods_id)a
)b
where b.rank1<=10;
3.商品的收藏次数top10,排名需考虑并列排名的情况
select * from
(select a.goods_id as goods_id,a.fav as fav,dense_rank() over(order by a.fav desc ) as rank1
from
(select goods_id,count(1) fav from behavior where behavior="fav" group by goods_id)a
)b
where b.rank1<=10;
4.城市购买力排名
create table city_rank as
select addr as city ,round(sum(price*amount),2) as money from behavior group by addr
order by money desc;
select * from city_rank;
九、利用sqoop将数据分析结果导入mysql存储
1.在mysql创建一张表,字段类型、顺序都和hive中的表一样
create table city_rank (city varchar(20), money double);
2.测试sqoop连接mysql是否成功
sqoop list-tables --connect jdbc:mysql://localhost:3306/ljh --username root --password root
#结果可以看到ljh数据库下的表信息,所以连接成功
3.利用sqoop将数据分析结果导入mysql存储
sqoop export --connect jdbc:mysql://localhost:3306/ljh --username root --password root --table city_rank --fields-terminated-by '\001' --export-dir '/user/hive/warehouse/ljh.db/city_rank';
4.mysql中查询导入结果,看结果是否正确