现有用户及用户订单的文件数据
一、将数据导入数据库
二、分析用户消费行为
- 统计不同月份的下单人数
- 统计三月份用户的回购率和复购率
- 统计男女的消费频次是否有差异
- 统计多次消费的用户,第一次和最后一次消费时间的间隔
- 统计不同年龄段的用户消费金额是否有差异
- 统计消费的top20%用户,贡献了多少额度
一、将数据导入数据库
1、建表
- 订单明细表
CREATE TABLE ORDERINFO (
ORDERID varchar(10) NOT NULL,--订单ID,主键
USERID varchar(10) NULL,--用户ID,可以和用户表进行关联
ISPAID varchar(10) NULL,--是否支付
PRINCE varchar(10) NULL,--订单价格
PAIDTIME varchar(16) NULL, --订单支付时间
PRIMARY KEY(ORDERID) -- 主键
)ENGINE = INNODB;
- 用户信息表
CREATE TABLE USERINFO (
USERID varchar(10) NOT NULL,--用户ID,主键
SEX varchar(10) NULL,--性别
BIRTH varchar(10) NULL, --出生日期
PRIMARY KEY(USERID)
)ENGINE = INNODB;
2、处理数据(空值、异常值)
- pandas读取数据
import pandas as pd
import os
os.chdir(r'D:\anaconda\workplace\Data_Analysis\Data')
userinfo = pd.read_csv('user_info_utf.csv') #用户信息数据
orderinfo = pd.read_csv('order_info_utf.csv') #订单数据
存在如下异常值
-
处理空值及异常数据
# 将读取的数据赋值给userdata
userdata = userinfo
# 去除空值
userdata.dropna(axis=0,inplace=True)
# 如果出生年份不是以'19'或者'20'开头,则去除
for i in userdata['BIRTH']:
if not i[:4].startswith(('19','20')):
userdata.drop(userdata[userdata['BIRTH']==i].index,axis=0,inplace=True)
- 处理好异常数据后,在将数据导入数据库
#导入相关库
from sqlalchemy import create_engine
#创建连接
engine = create_engine("mysql+pymysql://root:root@127.0.0.1:3306/learn?charset=gbk")
#连接数据库并导入数据
userdata.to_sql('userinfo',con=engine,index=False,if_exists='append')
可用相同方法处理并导入订单数据
二、分析用户消费行为
- 统计不同月份的下单人数
select CONCAT(substring(paidtime,1,4),'-',REPLACE(substring(paidtime,6,2),'/','')) MONTH,count(distinct USERid) XDRS from ORDERINFO
where ISPAID = '已支付'
group by substring(paidtime,1,4),substring(paidtime,6,2);
结果:
- 统计三月份用户的回购率和复购率
复购率
-- 复购率是在本月消费中多少人消费一次以上的占比
SELECT COUNT(tt.USERID) AS '三月购买的总用户数',count(tt.USERID_count) AS '多次购买用户数' FROM
(SELECT
t.USERID AS USERID,
case when t.USERID_count>1 then 1 else null end USERID_count
from
(select USERID,count(USERID) AS USERID_count from ORDERINFO
where ISPAID = '已支付' and REPLACE(SUBSTRING(paidtime,6,2),'/','') = '3'
group by USERID ) t
) tt;
结果:
回购率:
-- 回购率是三月份购买的人四月份依旧购买
select substring(a.paidtime,1,7) '年月',count(distinct a.USERID) '本月消费的用户数量',count(distinct b.USERID)'本月回购的用户数'
from(select userid,paidtime from ORDERINFO where ISPAID = '已支付'
group by userid,paidtime) a
left join (select userid,paidtime from ORDERINFO
where ISPAID = '已支付'
group by userid,paidtime) b
on a.userid = b.userid and substring(a.paidtime,6,2) = substring(b.paidtime,6,2)-1
group by substring(a.paidtime,1,7)
结果:
- 统计男女的消费频次是否有差异
select aa.sex '性别',round(avg(aa.PC_ORDERID),2) '消费频次' from
(select a.USERID,b.sex,count(a.ORDERID) PC_ORDERID from ORDERINFO a
inner join (select * from USERINFO where SEX is not null) b on a.USERID = B.USERID
group by a.USERID,b.sex) aa
group by aa.sex;
结果:
- 统计多次消费的用户,第一次和最后一次消费时间的间隔
select USERID,ORDERID_CS,paidtime_max,paidtime_min,
datediff(paidtime_max,paidtime_min) AS '第一次最后一次消费时间差'
FROM
(SELECT USERID,count(ORDERID) ORDERID_CS,max(paidtime) paidtime_max,min(paidtime) paidtime_min from ORDERINFO
WHERE ISPAID = '已支付'
GROUP BY USERID HAVING count(ORDERID)>1 ORDER BY orderid_cs DESC) aa;
结果:
- 统计不同年龄段的用户消费金额是否有差异
select aa.NLD,round(avg(aa.PRINCE),2) avg_prince from
(select a.ORDERID,A.USERID,cast(A.PRINCE as float) PRINCE,
case
when B.AGE between 10 and 19 then '10-19岁'
when B.AGE between 20 and 29 then '20-29岁'
when B.AGE between 30 and 39 then '30-39岁'
when B.AGE between 40 and 49 then '40-49岁'
when B.AGE between 50 and 59 then '50-59岁'
when B.AGE between 60 and 69 then '60-69岁'
when B.AGE between 70 and 79 then '70-79岁'
else null
end NLD,
B.AGE from (
select * from ORDERINFO where ISPAID = '已支付') a
inner join (
select USERID,year(NOW())-left(BIRTH,4) age from USERINFO
where BIRTH is not null) b on a.USERID = b.USERID
)aa
group by aa.NLD
having aa.NLD is not null
结果:
- 统计消费的top20%用户,贡献了多少额度
select sum(sum_prince) from -- top20%用户贡献的总额度
(select *,row_number() over(order by sum_prince desc) as '排序' from
(select
userid,round(sum(cast(prince as float)),2) sum_prince
from ORDERINFO
where ISPAID = '已支付' group by USERID) aa) tt
where
'排序' < (select count(distinct USERID)*0.2 from ORDERINFO where ISPAID = '已支付')
结果: