Background
- The company is a domestic manufacturer of bicycles and related accessories.
- By monitoring online and offline commodity sales through a dashboard, we can grasp the latest sales trends and the regional distribution of commodities, so as to optimize the business and achieve efficient operation.
Mind Map

Dashboard Exhibition

Data preprocessing
Before generating the dashboard, preprocess the data.
1.Import modules and data
import sqlalchemy
import pandas as pd
import datetime

#coding:gbk
# Load the aggregated customer-order table used throughout this section.
engine=sqlalchemy.create_engine("mysql+pymysql://<user>:<password>@<IPaddress>:<port>/<database>?charset=gbk")
sql_cmd='select * from <table>'
dw_customer_order=pd.read_sql_query(sql=sql_cmd,con=engine)
# BUG FIX: everything below works on gather_customer_order, but the query result
# was only bound to dw_customer_order, so the next lines raised NameError.
# Also added the missing `import datetime` used by the strptime calls below.
gather_customer_order=dw_customer_order
# new column year-month: 'YYYY-MM' slice of the 'YYYY-MM-DD' create_date string
gather_customer_order['create_year_month']=gather_customer_order['create_date'].apply(lambda x:x[:7])
# new column year-quarter, e.g. '2021年1季度' (year + quarter label)
gather_customer_order['季度']=gather_customer_order['create_date'].apply(lambda x:(datetime.datetime.strptime(x,'%Y-%m-%d').month-1)//3+1).astype('str')+'季度'
gather_customer_order['year']=gather_customer_order['create_date'].apply(lambda x:(datetime.datetime.strptime(x,'%Y-%m-%d').year)).astype('str')+'年'
gather_customer_order['year_season']=gather_customer_order['year']+gather_customer_order['季度']
# Drop the two helper columns once year_season is assembled.
del gather_customer_order['季度'],gather_customer_order['year']

2.Sales Amount Comparison
2.1 Product Sales Amount Comparison Between Quarters
# Aggregate sales amount by product category (cplb_zw) and year-quarter.
cplb_zw=gather_customer_order.groupby(['cplb_zw','year_season']).agg({'sum_amount':"sum"})
cplb_zw=cplb_zw.reset_index()
# Quarter-over-quarter growth within each product category.
# FIX: Series.append() was removed in pandas 2.0 — collect the per-category
# pct_change pieces in a list and concatenate them once instead.
diff_parts=[]
for category in cplb_zw['cplb_zw'].unique():
    diff_parts.append(cplb_zw[cplb_zw['cplb_zw']==category]['sum_amount'].pct_change())
cplb_zw['diff']=pd.concat(diff_parts)
# Format as a percentage string with two decimals; each category's first quarter
# (NaN — nothing to compare against) and exact zero growth show as '-'.
# FIX: the original replace(0,'0.00') compared the int 0 against strings and
# never matched, so NaN values leaked through as the literal text 'nan%'.
cplb_zw['diff']=cplb_zw['diff'].apply(lambda x:'-' if pd.isna(x) or x==0 else '{:.2f}%'.format(x*100))

# export data to database
save_engine=sqlalchemy.create_engine("mysql+pymysql://root:123@127.0.0.1:3306/mozi?charset=gbk")
cplb_zw.to_sql('cplb_zw_season',con=save_engine,if_exists='replace')
2.2 Product Sales Amount Comparison Between Months
import sqlalchemy
import pandas as pd
import numpy as np
#coding:gbk
# Load the raw customer-order table for the month-over-month comparison.
engine=sqlalchemy.create_engine("mysql+pymysql://<user>:<password>@<IPaddress>:<port>/<database>?charset=gbk")
sql_cmd='select * from <table>'
dw_customer_order=pd.read_sql_query(sql=sql_cmd,con=engine)
# Derive 'YYYY-MM' from the 'YYYY-MM-DD' create_date string.
dw_customer_order['create_year_month']=dw_customer_order['create_date'].apply(lambda x:x[:7])
# Monthly totals of order count and sales amount.
dw_customer_order2=dw_customer_order.groupby('create_year_month').agg({'order_num':sum,'sum_amount':sum})
# NOTE(review): diff()/current divides the month-over-month change by the
# CURRENT month, not the previous one; standard MoM growth would be
# pct_change() (divide by previous). Confirm which ratio the dashboard expects.
dw_customer_order2['order_num_diff']=dw_customer_order2['order_num'].diff()/dw_customer_order2['order_num']
dw_customer_order2['sum_amount_diff']=dw_customer_order2['sum_amount'].diff()/dw_customer_order2['sum_amount']
dw_customer_order2=dw_customer_order2.reset_index()
dw_customer_order2

#export data to database (hard-coded local MySQL target for the dashboard)
save_engine=sqlalchemy.create_engine("mysql+pymysql://root:123@127.0.0.1:3306/bike_sales?charset=gbk")
dw_customer_order2.to_sql('dw_customer_order2',con=save_engine,if_exists='replace')
2.3 Proportion of bicycle sales quantity
Proportion of bicycle sales quantity
# Alias the raw table; everything below works on gather_customer_order.
gather_customer_order=dw_customer_order
# new column year-month ('YYYY-MM' slice of the date string)
gather_customer_order['create_year_month']=gather_customer_order['create_date'].apply(lambda x:x[:7])
# analyze bikes only (cplb_zw is the product-category column; '自行车' = bicycle)
gather_customer_order=gather_customer_order[gather_customer_order['cplb_zw']=='自行车']
# groupby month: total bicycle order count and amount per month
overall_sales_performance=gather_customer_order.groupby('create_year_month').agg({'order_num':sum,'sum_amount':sum})
# Cumulative sales of bicycles per month (index back to a column for the merge)
gather_customer_order_group_month = overall_sales_performance.reset_index()
# Merge bicycle sales tables: each order row gains its month's totals
# (merge suffixes: _x = per-row value, _y = monthly total).
order_num_proportion = pd.merge(gather_customer_order,gather_customer_order_group_month,how='left',on='create_year_month')
order_num_proportion=order_num_proportion.drop(columns='sum_amount_y')
# Calculate bicycle sales / monthly bicycle sales ratio
order_num_proportion['order_proportion'] = order_num_proportion['order_num_x']/order_num_proportion['order_num_y']
# rename order_num_y as sum_month_order
order_num_proportion=order_num_proportion.rename(columns={'order_num_y':'sum_month_order'})

# export to database
# NOTE(review): unlike the other exports this call has no if_exists argument, so
# a second run raises ValueError because the table already exists — confirm intent.
order_num_proportion.to_sql('pt_bicycle_product_sales_month',con=save_engine)
Road/mountain/tourist bicycle segment performance
2.3.1 Road bicycle
# --- Road bicycles ('公路自行车'): monthly sales per model plus the month total ---
gather_customer_order_road=gather_customer_order[gather_customer_order['cpzl_zw']=='公路自行车']
# Sales quantity per road-bike model per month.
gather_customer_order_road_month=(gather_customer_order_road
    .groupby(['create_year_month','product_name'],as_index=False)
    .agg({'order_num':sum}))[['create_year_month','product_name','order_num']]
# Total road-bike sales in each month.
gather_customer_order_road_month_sum=(gather_customer_order_road_month
    .groupby('create_year_month',as_index=False)
    .agg({'order_num':sum}))
# Attach the month total to every model row (order_num_x = model, order_num_y =
# month total) so each model's share can be computed later.
gather_customer_order_road_month1=pd.merge(gather_customer_order_road_month,
                                           gather_customer_order_road_month_sum,
                                           how='left',on='create_year_month')
gather_customer_order_road_month1

2.3.2 Mountain bicycle
# screening: mountain bikes ('山地自行车') only
gather_customer_order_mountain=gather_customer_order[gather_customer_order['cpzl_zw']=='山地自行车']
# Sales quantity per mountain-bike model per month
gather_customer_order_Mountain_month = gather_customer_order_mountain.groupby(['create_year_month','product_name']).agg({'order_num':sum})
gather_customer_order_Mountain_month=gather_customer_order_Mountain_month.reset_index()
# Total mountain-bike sales in each month
gather_customer_order_Mountain_month_sum = gather_customer_order_Mountain_month.groupby('create_year_month').agg({'order_num':sum})
# BUG FIX: reset_index() was missing here, so 'create_year_month' stayed in the
# index of the right-hand frame and the merge below (on='create_year_month')
# raised KeyError. Adding it also matches the road/tour sections.
gather_customer_order_Mountain_month_sum=gather_customer_order_Mountain_month_sum.reset_index()
# merge gather_customer_order_Mountain_month and the monthly totals,
# used to calculate the proportion of each model within its month
gather_customer_order_Mountain_month = pd.merge(gather_customer_order_Mountain_month,gather_customer_order_Mountain_month_sum,how='left',on='create_year_month')
gather_customer_order_Mountain_month.head()

2.3.3 Tourist bicycle
# --- Touring bicycles ('旅游自行车'): per-model monthly sales and the month total ---
is_tour=gather_customer_order['cpzl_zw'] == '旅游自行车'
gather_customer_order_tour=gather_customer_order[is_tour]
# Per-model quantity per month.
gather_customer_order_tour_month=(gather_customer_order_tour
    .groupby(['create_year_month','product_name'],as_index=False)
    .agg({"order_num":sum}))
# Monthly totals across all touring models.
gather_customer_order_tour_month_sum=(gather_customer_order_tour
    .groupby('create_year_month',as_index=False)
    .agg({"order_num":sum}))
# Join the totals back (suffixes: order_num_x = model, order_num_y = month total).
gather_customer_order_tour_month=pd.merge(gather_customer_order_tour_month,
                                          gather_customer_order_tour_month_sum,
                                          how='left',on='create_year_month')
gather_customer_order_tour_month.head()

Merge three types of bicycle data
# Stack the three bicycle-type tables into one (rows appended, axis=0).
gather_customer_order_month=pd.concat([gather_customer_order_Mountain_month,gather_customer_order_road_month1,gather_customer_order_tour_month],axis=0)
# Share of each model within its month: order_num_x (model) / order_num_y (month total).
gather_customer_order_month['order_num_proportion']=gather_customer_order_month['order_num_x']/gather_customer_order_month['order_num_y']
gather_customer_order_month.rename(columns={'order_num_x':'order_month_product','order_num_y':'sum_order_month'},inplace=True)
gather_customer_order_month.head()

2.4 Sales amount in February and March of 2021
# Restrict to Feb & Mar 2021 (.copy() avoids SettingWithCopyWarning on the
# column assignment below).
gather_customer_order_month_2_3=gather_customer_order_month[gather_customer_order_month.create_year_month.isin(['2021-02','2021-03'])].copy()
# Month-over-month change per product.
# FIX: Series.append() was removed in pandas 2.0 — collect the per-product
# pieces in a list and concatenate them once instead.
# NOTE(review): diff()/current divides by the current month rather than the
# previous one (pct_change would use the previous); kept as-is to match the
# rest of the analysis — confirm which ratio the dashboard expects.
diff_parts=[]
for name in gather_customer_order_month_2_3['product_name'].unique():
    per_product=gather_customer_order_month_2_3.loc[gather_customer_order_month_2_3['product_name']==name,'order_month_product']
    diff_parts.append(per_product.diff()/per_product)
# February rows have no predecessor (NaN) — treated as zero change.
gather_customer_order_month_2_3['order_num_diff']=pd.concat(diff_parts).fillna(0)
3.Analysis of User Behaviors
#Read the database customer information table
# Read the customer information table (customers created before the cutoff date).
engine=sqlalchemy.create_engine("mysql+pymysql://<user>:<password>@<IPaddress>:<port>/<database>?charset=gbk")
df_CUSTOMER = pd.read_sql("select customer_key,birth_date,gender,marital_status from ods_customer where create_date < '2021-02-28'",con = engine)
df_sales_orders_2 = pd.read_sql_query("select * from ods_sales_orders",con = engine)
df_sales_orders_2['create_year_month']=df_sales_orders_2['create_date'].astype('str').apply(lambda x:x[:7])
# Quick per-month order count / unit-price total (result not bound; display only).
df_sales_orders_2.groupby('create_year_month').agg({'create_year_month':'count',
'unit_price':sum})
# Attach customer attributes to each order, then drop rows with any missing value.
df_sales_customer=pd.merge(df_sales_orders_2,df_CUSTOMER,on='customer_key',how='left')
df_sales_customer_drop=df_sales_customer.dropna(axis=0)
# Age as of 2021 (hard-coded analysis year), from the 'YYYY-MM-DD' birth_date string.
df_sales_customer_drop['birth_year']=df_sales_customer_drop['birth_date'].apply(lambda x:x.split('-')[0])
df_sales_customer_drop['birth_year']=2021-df_sales_customer_drop['birth_year'].astype('int')
df_sales_customer_drop.rename(columns={'birth_year':'age'},inplace=True)
# Five-year age buckets, 30-64.
age_labels=["30-34","35-39","40-44","45-49","50-54","55-59","60-64"]
age_bins=list(np.arange(30,66,5))
df_sales_customer_drop['age_group']=pd.cut(df_sales_customer_drop['age'],labels=age_labels,bins=age_bins)
# Coarser three-level grouping used by the dashboard.
# NOTE(review): with bins (0,29,40,100], age 40 falls into the '30-39' bucket and
# '>=40' actually starts at 41 — confirm the intended bin edges.
age_labels_2=['<=29','30-39','>=40']
age_list_2=[0,29,40,100]
df_sales_customer_drop['age_level_2']=pd.cut(df_sales_customer_drop['age'],labels=age_labels_2,bins=age_list_2)
3.1 Proportion of User Gender
# Order-line counts per gender and per coarse age level (cpzl_zw is simply a
# non-null column to count rows on).
gender_count=df_sales_customer_drop.groupby('gender').agg({'cpzl_zw':'count'}).reset_index()
age_level_2_count=df_sales_customer_drop.groupby('age_level_2').agg({'cpzl_zw':'count'}).reset_index()
# BUG FIX: gender_count was computed but never joined back, so the
# 'gender_count' column read below did not exist (KeyError). Join it first,
# renaming the count column before the merge to avoid suffix collisions.
df_sales_customer_drop=pd.merge(df_sales_customer_drop,gender_count.rename(columns={'cpzl_zw':'gender_count'}),on='gender',how='left')
df_sales_customer_drop=pd.merge(df_sales_customer_drop,age_level_2_count,on='age_level_2')
df_sales_customer_drop.rename(columns = {'cpzl_zw_y':'age_level2_count'},inplace=True)
# Per-row weight: summing gender_rate over one gender's rows gives 1.
df_sales_customer_drop['gender_rate'] = 1/df_sales_customer_drop['gender_count']
# Map the F/M and M/S codes to the Chinese labels shown on the dashboard.
df_sales_customer_drop['gender']=df_sales_customer_drop['gender'].replace('F','女性')
df_sales_customer_drop['gender']=df_sales_customer_drop['gender'].replace('M','男性')
df_sales_customer_drop['marital_status']=df_sales_customer_drop['marital_status'].replace('M','已婚')
df_sales_customer_drop['marital_status']=df_sales_customer_drop['marital_status'].replace('S','单身')

# export to database
df_sales_customer_drop.to_sql('df_sales_customer_dropna',con=save_engine,if_exists='replace')
3.2 User Behavior
3.2.1 Analysis of user consumption trends (monthly)
a.sales amount in every month
b.orders amount in every month
c.sales quantity in every month
d.the number of consumers in every month
# Monthly consumption trend: purchase count, total spend, distinct customers.
df_sales_customer_drop_month=df_sales_customer_drop.groupby('create_year_month').agg({'create_year_month':'count',
'unit_price':sum,
'customer_key':'nunique'})
# Chinese dashboard column names: 消费金额 = spend, 客户数量 = customer count,
# 产品购买量 = products purchased, 年月 = year-month.
df_sales_customer_drop_month.rename(columns = {'unit_price':'消费金额',
'customer_key': '客户数量',
'create_year_month': '产品购买量'}, inplace=True)
df_sales_customer_drop_month=df_sales_customer_drop_month.reset_index().rename(columns={'create_year_month':'年月'})
# Derived metrics: average unit price and average spend per customer.
df_sales_customer_drop_month['销售平均单价']=df_sales_customer_drop_month['消费金额']/df_sales_customer_drop_month['产品购买量']
df_sales_customer_drop_month['平均客单价']=df_sales_customer_drop_month['消费金额']/df_sales_customer_drop_month['客户数量']
# export to database
df_sales_customer_drop_month.to_sql('df_sales_customer_drop_month',con=save_engine,if_exists='replace')
3.2.2 Analysis of individual consumption of users
a.Descriptive statistics of consumption amount and consumption frequency
b.Distribution of users' consumption
c.Proportion of accumulated consumption amount of users
# Per-customer totals: number of order lines (购买数量) and total spend (消费金额).
df_sales_customer_drop_client=df_sales_customer_drop.groupby('customer_key').agg({'customer_key':'count',
'unit_price':sum})
df_sales_customer_drop_client=df_sales_customer_drop_client.rename(columns={'customer_key':'购买数量',
'unit_price':'消费金额'}).reset_index()
df_sales_customer_drop_client=df_sales_customer_drop_client.rename(columns={'customer_key':'客户id'})
# FIX: the bin edges previously shadowed the builtin `list`; use a descriptive name.
amount_bins=[i for i in range(0,int(df_sales_customer_drop_client['消费金额'].max())+1000,500)]
# Bucket each customer's spend into 500-wide bins, labelled by the bin's upper edge.
df_sales_customer_drop_client['消费金额分组']=pd.cut(df_sales_customer_drop_client['消费金额'],bins=amount_bins,labels=amount_bins[1:])

# export to database
df_sales_customer_drop_client.to_sql('df_sales_customer_drop_client',con=save_engine,if_exists='replace')
Accumulated consumption
# Pareto curve: sort customers by total spend, take the running cumulative sum and
# divide by the grand total → each customer's cumulative share (6 decimals).
df_sales_customer_drop_cumsum=round(df_sales_customer_drop.groupby('customer_key').sum().sort_values('unit_price').cumsum()/df_sales_customer_drop.groupby('customer_key').sum().sort_values('unit_price').sum(),6)
# NOTE(review): the cumulative 'age' column is relabelled 订单量百分比 ("order
# share") — age appears to be used as a per-row stand-in for order count here;
# confirm that is intentional.
df_sales_customer_drop_cumsum=df_sales_customer_drop_cumsum.reset_index().rename(columns={'unit_price':'消费金额百分比',
'age':'订单量百分比',
'customer_key':'客户ID'}).iloc[:,:3]
df_sales_customer_drop_cumsum

# export to database
df_sales_customer_drop_cumsum.to_sql('df_sales_customer_drop_cumsum',con=save_engine,if_exists='replace')
3.2.3 User consumption behavior
a.First purchase
b.Last purchase
c.RFM model
d.Quantity of new, old, active, backflow and loss
e.User consumption cycle distribution
f.User life cycle
a.First purchase
# Distribution of first-purchase dates: per customer, the earliest create_date,
# then the number of customers per date.
# FIX: value_counts().reset_index() no longer yields an 'index' column in
# pandas 2.x, so rename({'index':...}) silently did nothing there.
# rename_axis + reset_index(name=) yields columns ['first_date','create_date']
# on every pandas version.
df_sales_customer_firstdate=df_sales_customer_drop.groupby('customer_key').min().create_date.value_counts().rename_axis('first_date').reset_index(name='create_date')
# export to database
df_sales_customer_firstdate.to_sql('df_sales_customer_firstdate',con=save_engine,if_exists='replace')
df_sales_customer_firstdate

b.Last purchase
# Distribution of last-purchase dates, mirroring the first-purchase table.
# FIX: same pandas-2.x issue as above — rename_axis/reset_index(name=) produces
# columns ['last_date','create_date'] on every pandas version.
df_sales_customer_lastdate=df_sales_customer_drop.groupby('customer_key').max().create_date.value_counts().rename_axis('last_date').reset_index(name='create_date')
# export to database
df_sales_customer_lastdate.to_sql('df_sales_customer_lastdate',con=save_engine,if_exists='replace')
df_sales_customer_lastdate

# Per-customer first ('min') and last ('max') purchase dates.
df_sales_customer_user_life=df_sales_customer_drop.groupby('customer_key').create_date.agg(['min','max'])
# Customers whose first purchase equals their last bought exactly once.
# FIX: rename(columns={0:...}) relied on pandas 1.x naming the value_counts
# column 0; naming the Series before to_frame() works on all versions.
buy_onlyone=(df_sales_customer_user_life['min']==df_sales_customer_user_life['max']).value_counts().rename('buy_onlyone').to_frame()
c.RFM model
# First/last purchase per (month, customer); 'new' marks customers whose only
# activity in that month is a single date (min == max).
df_sales_customer_new=df_sales_customer_drop.groupby(['create_year_month','customer_key']).create_date.agg(['min','max'])
df_sales_customer_new['new']=(df_sales_customer_new["min"] == df_sales_customer_new["max"] )
# divide user group
# pivot_table: one row per customer — last purchase date, order-line count, total spend
rfm=df_sales_customer_drop.pivot_table(index='customer_key',
values=['product_key','unit_price','create_date'],
aggfunc={'create_date':'max',
'product_key':'count',
'unit_price':'sum'})
# Recency in days: gap between the customer's last purchase and the most recent
# purchase in the whole dataset (larger = longer since last purchase).
rfm['R']= -(rfm.create_date - rfm.create_date.max())/np.timedelta64(1,'D')
# R: Recency  F: Frequency  M: Monetary
rfm.rename(columns={'product_key':"F",'unit_price':'M'},inplace=True)
rfm

def rfm_func(x):
    """Map a mean-centred R/F/M row to a Chinese customer-segment label.

    Each of R, F and M has already had its column mean subtracted, so a
    non-negative value means "at or above average" and is encoded '1',
    a negative value '0'. The three flags, in R-F-M order, pick one of
    the eight standard RFM segments.
    """
    flags = ''.join('1' if x[dim] >= 0 else '0' for dim in ('R', 'F', 'M'))
    # R=1: above-average recency gap; F=1: above-average purchase count;
    # M=1: above-average spend.
    segments = {
        '111': '重要价值客户',
        '011': '重要保持客户',
        '101': '重要发展客户',
        '001': '重要挽留客户',
        '110': '一般价值客户',
        '010': '一般保持客户',
        '100': '一般发展客户',
        '000': '一般挽留客户',
    }
    return segments[flags]
# Centre each of R/F/M on its column mean (default axis=0 apply), then label each
# row: rfm_func encodes non-negative (above-average) values as '1', negative as '0'.
rfm['label']=rfm[['R','F','M']].apply(lambda x:x-x.mean()).apply(rfm_func,axis=1)
# Binary colour flag for the dashboard: key-value customers vs everyone else.
rfm.loc[rfm.label=='重要价值客户','color']='重要价值客户'
rfm.loc[~(rfm.label=='重要价值客户'),'color']='非重要价值客户'
rfm=rfm.reset_index()
rfm

# export to database
rfm.to_sql('rfm',con=save_engine,if_exists='replace')
d.Quantity of new, old, active, backflow and loss
# pivot_table
# pivot_table: customers × months, number of purchases in each month (0 if none).
pivoted_counts=df_sales_customer_drop.pivot_table(index='customer_key',
                                                  columns='create_year_month',
                                                  values='create_date',
                                                  aggfunc='count').fillna(0)
# Binarise: 1 = bought in that month, 0 = did not.
# FIX: DataFrame.applymap is deprecated (pandas 2.1+); the vectorised comparison
# produces the same 0/1 frame and avoids the per-element Python call.
df_purchase=(pivoted_counts>0).astype(int)
# Here, due to pivottability, null values of 0 are filled in, and it is possible that the user did not register at all during the month,
#This will mislead the statistics of the first consumption data, so write a function to handle it
def active_status(data):
    """Classify each month of a user's 0/1 purchase row into a lifecycle status.

    data: a sequence (pandas row or plain list) of monthly purchase flags,
          where 0 = no purchase that month and anything else = purchased.
    Returns a list of the same length with one status per month:
      'unreg'    - never purchased so far (treated as not yet registered)
      'new'      - first ever purchase this month
      'active'   - purchased this month and also last month
      'unactive' - purchased before, but not this month (lapsed)
      'return'   - purchased this month after a lapse

    FIX: the original hard-coded ``range(15)`` (15 months of data); iterating the
    actual input length generalizes the function while remaining backward
    compatible for 15-month rows.
    """
    vals = list(data)  # works for both pandas Series rows and plain sequences
    status = []
    for i, purchased in enumerate(vals):
        if purchased == 0:
            if not status:
                # No previous data at all: considered unregistered.
                status.append('unreg')
            elif status[i - 1] == 'unreg':
                status.append('unreg')
            else:
                status.append('unactive')
        else:
            if not status:
                status.append('new')
            elif status[i - 1] == 'unactive':
                status.append('return')
            elif status[i - 1] == 'unreg':
                status.append('new')
            else:
                status.append('active')
    return status
When there is no consumption in a given month, only the previous month's status is used to decide whether the user counts as registered yet — a known limitation of this approach:
- If the user was previously unregistered, they remain unregistered
- If the user has consumed before, they are marked as lapsed/inactive
- In all other cases, they are treated as unregistered
# Apply the lifecycle classifier to every customer row.
purchase_stats=df_purchase.apply(lambda x: pd.Series(active_status(x),index=df_purchase.columns),axis=1)
# Replace 'unreg' with NaN so those months drop out of the per-month counts.
# FIX: np.NaN was removed in NumPy 2.0 (use np.nan) and the module-level
# pd.value_counts is deprecated — count via the Series method instead.
purchase_stats_ct=purchase_stats.replace('unreg',np.nan).apply(lambda x:x.value_counts())
# Share of each status within each month.
returnratee=purchase_stats_ct.apply(lambda x:x/x.sum(),axis=0)
# Monthly user distribution, months as rows, for the dashboard.
purchase_stats_ct_info = purchase_stats_ct.fillna(0).T
purchase_stats_ct_info=purchase_stats_ct_info.reset_index()
purchase_stats_ct_info

e.User consumption cycle distribution
# User purchase cycle (by order)
# Time interval between two adjacent orders per customer: shift() moves
# create_date down one row, so each row sees its previous order's date.
order_diff=df_sales_customer_drop.groupby('customer_key').apply(lambda x:x.create_date-x.create_date.shift())
# Each customer's first order has no predecessor (NaT) — filled with 10.
# NOTE(review): filling a timedelta series with the integer 10 mixes types; the
# two loops below exist only to coerce every element to float days. Confirm that
# 10 is intended to mean "10 days" for first orders.
order_diff1=order_diff.fillna(10)
for i in range(0,len(order_diff1),1):
    if type(order_diff1[i])==int:
        order_diff1[i]=float(order_diff1[i])
for i in range(0,len(order_diff1),1):
    if type(order_diff1[i])!=float:
        order_diff1[i]=float(order_diff1[i].days)
# 10-day-wide histogram bins, labelled by each bin's upper edge.
order_diff_cut_lst = [i for i in range(0,int(order_diff1.max())+1,10)]
order_diff_info_hist = pd.cut(order_diff1,bins=order_diff_cut_lst,labels=order_diff_cut_lst[1:])
order_diff_info_hist=pd.DataFrame(order_diff_info_hist)
order_diff_info_hist

# export to database
order_diff_info_hist.to_sql('order_diff_info_hist',con=save_engine,if_exists='replace')
f.User life cycle
- Order cycles are exponentially distributed
- The average purchase cycle is 68 days
- The majority of users have a purchase cycle of less than 100 days
# Lifecycle length per user, in days, from first to last purchase.
user_life=df_sales_customer_user_life
user_life_info = ((user_life['max']-user_life['min'])/np.timedelta64(1,"D"))
# 10-day histogram bins labelled by each bin's upper edge.
user_life_lst = [i for i in range(0,int(user_life_info.max())+1,10)]
user_life_info_hist = pd.cut(user_life_info,bins=user_life_lst,labels=user_life_lst[1:])
# One-time buyers have lifetime 0, which falls outside the first (0, 10] bin and
# becomes NaN — map them into the 10 bucket.
user_life_info_hist_2 = user_life_info_hist.fillna(10)
user_life_info_hist_2=pd.DataFrame(user_life_info_hist_2)
user_life_info_hist_2=user_life_info_hist_2.reset_index()
user_life_info_hist_2

# export to database
user_life_info_hist_2.to_sql('user_life_info_hist_2',con=save_engine,if_exists='replace')
- The user's life cycle is heavily influenced by users who only buy once (can be excluded)
- The average user spends 134 days, and the median is only 0 days
# Raw lifetime per user as a timedelta; '差值' = last minus first purchase date.
user_life["差值"]=(user_life["max"] - user_life["min"])
user_life=user_life.reset_index()
user_life.head()

# export to database
user_life.to_sql('user_life',con=save_engine,if_exists='replace')
3.2.4 Repurchase in one month proportion and repurchase proportion analysis
a.Repurchase in one month proportion
-purchase more than one time in one month
b.Repurchase proportion
-purchase more than one time in whole time
# Distinguish one from more than one in order to calculate the repurchase rate, greater than 1 is 1, equal to 0 is 0, equal to 1 is 0
# Per month: 1 = repurchased (bought more than once), 0 = bought exactly once,
# NaN = did not buy that month (so count() excludes them from the denominator).
# FIX: np.NaN was removed in NumPy 2.0 — use np.nan.
purchase_r=pivoted_counts.applymap(lambda x: 1 if x>1 else np.nan if x==0 else 0)
# Monthly repurchase rate = repurchasers / buyers.
purchase_r_reshop = (purchase_r.sum()/purchase_r.count()).reset_index(name = 'reshop')
purchase_r_reshop['create_year_month'] = purchase_r_reshop['create_year_month'].astype(str)
purchase_r_reshop

The repurchase in one month proportion remained stable at 20%. In the first five months, due to the influx of a large number of new users who only purchased once, the repurchase in one month proportion decreased dramatically
# You need to use a function to determine whether a repurchase is possible: users who spend money in one month also spend money in the next month are called repurchase, and this definition can be changed
def purchase_back(data):
    """Per-month repurchase flags for one user's 0/1 monthly purchase row.

    A month is a repurchase (1) when the user bought that month AND the next;
    0 when they bought that month but not the next; NaN when they did not buy
    that month. The final month has no "next month" and is always NaN.

    FIX: the original hard-coded ``range(14)`` (15 months); iterating
    ``len(data)-1`` generalizes it while staying backward compatible.
    FIX: np.NaN was removed in NumPy 2.0 — use np.nan.
    """
    vals = list(data)  # accept pandas rows and plain sequences alike
    status = []
    for i in range(len(vals) - 1):
        if vals[i] == 1:
            # Bought this month: repurchase iff they buy again next month.
            status.append(1 if vals[i + 1] == 1 else 0)
        else:
            status.append(np.nan)
    # The last month is filled with NaN (no following month to check).
    status.append(np.nan)
    return status
# Month labels for the per-user repurchase series: sorted unique months as strings.
# FIX: the original first assigned indexs = df_purchase.columns and immediately
# overwrote it on the next line — the dead assignment is dropped.
indexs=df_sales_customer_drop['create_year_month'].sort_values().astype('str').unique()
purchase_b = df_purchase.apply(lambda x :pd.Series(purchase_back(x),index = indexs),axis =1)
# Share of "bought this month" users who bought again the following month
# (count() skips the NaN months, so the denominator is buyers only).
purchase_b_backshop = purchase_b.sum()/purchase_b.count()
purchase_b_backshop.index = purchase_b_backshop.index.astype(str)
purchase_b_backshop=pd.DataFrame(purchase_b_backshop)
purchase_b_backshop

# export to database
# NOTE(review): this exports the one-month repurchase table (purchase_r_reshop),
# not the purchase_b_backshop frame computed just above — confirm which table
# the dashboard actually reads.
purchase_r_reshop.to_sql('purchase_r_reshop',con=save_engine,if_exists='replace')
4.Sales analysis of hot products in March, 2021
# Restrict to March 2021 for the hot-product analysis.
gather_customer_order_3=gather_customer_order[gather_customer_order['create_year_month']=='2021-03']
TOP 10 product items and sales quantity comparison between Feb and Mar
# Ten best-selling products of 2021-03 by total sales quantity.
customer_order_3_top10=(gather_customer_order_3
    .groupby('product_name')
    .agg({'order_num':'sum'})
    .reset_index()
    .sort_values('order_num',ascending=False)
    .head(10))
customer_order_3_top10

# Keep only the columns the dashboard needs for the Feb/Mar comparison.
# NOTE(review): assumes cpzl_zw survived the earlier groupby/merge chain that
# built gather_customer_order_month_2_3 — confirm the column exists at this point.
customer_order_month_2_3 = gather_customer_order_month_2_3[['create_year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
# Rows missing a category are presumed to be road bikes — verify against source data.
customer_order_month_2_3['cpzl_zw']=customer_order_month_2_3['cpzl_zw'].fillna('公路自行车')
# Keep only March's top-10 products.
customer_order_month_2_3=customer_order_month_2_3[customer_order_month_2_3['product_name'].isin\
(customer_order_3_top10['product_name']).tolist()]
customer_order_month_2_3['category']='本月TOP10销量'
# axis=0 concatenates by rows, axis=1 by columns
# NOTE(review): customer_order_month_3_top10_seep is not defined anywhere in this
# file — this line raises NameError as written; the frame is presumably built in
# a section not shown here. Confirm before running.
hot_products_3 = pd.concat([customer_order_month_2_3,customer_order_month_3_top10_seep],axis = 0)

