盯着上面的这个图,看着看着就想到: 使用前7天付费的数据,分两步,先分类出谁会继续付款,然后回归分析继续给钱的玩家会付款多少?还有第三部分,就是忽略掉前7天没付款,但是后面45天会付款的,然后就变成以下的图了。
一、分类
1、准备数据
先对数据进行一下预处理,主要是对object类的feature处理掉,没有什么归一化之类的处理的。
# Load the raw training set (`pd` is assumed to be pandas, imported earlier
# in the notebook -- TODO confirm).
data = pd.read_csv("tap_fun_train.csv", parse_dates=True)

# Collect the object-dtype columns so we can see which ones need encoding,
# and print one sample row of them.
object_columns_df = data.select_dtypes(include=["object"])
print(object_columns_df.iloc[1])

# register_time is the object feature that has to be encoded: slice the
# month and day out of the string (characters 5-6 and 8-9), then drop the
# raw column.
register_time = data['register_time']
data['register_time_month'] = register_time.str[5:7]
data['register_time_day'] = register_time.str[8:10]
data = data.drop(columns=['register_time'])

# The slices are still strings; convert both to numbers.
date_parts = ['register_time_month', 'register_time_day']
data[date_parts] = data[date_parts].apply(pd.to_numeric)

# Fold month and day into a single number so that earlier/later
# registrations are ordered along one axis.
data['register_time_count'] = (
    data['register_time_month'] * 31 + data['register_time_day']
)
data.shape
# (2288007, 111)

# Keep only the players who paid something during the first 7 days and
# persist them for the next steps.
paid_in_first_week = data['pay_price'] > 0
data_7_pay = data[paid_in_first_week].copy()
data_7_pay.shape
# (41439, 111)
data_7_pay.to_csv("tap_fun_train_7_pay.csv")
打标签label,删掉不该有的特征等。
# Read back the training rows of players who paid during the first 7 days.
data = pd.read_csv("tap_fun_train_7_pay.csv", index_col=0, parse_dates=True)
print(data.shape)  # (41439, 111)

# Build the label: 1 when the 45-day total equals the 7-day total (the
# player never paid again), 0 when the player kept paying.
paid_nothing_more = data['pay_price'].eq(data['prediction_pay_price'])
data['7_45_same_pay_label'] = paid_nothing_more.astype('int64')
data['7_45_same_pay_label'].value_counts()
# 1    30130   paid in the first 7 days, nothing afterwards
# 0    11309   paid in the first 7 days and kept paying

# Drop the columns that must not be used as features: the 45-day amount
# (not present in the test set) and user_id.
data = data.drop(columns=['prediction_pay_price', 'user_id'])
data.shape
2、训练分类模型
分拆数据,训练和验证部分。
from sklearn.model_selection import train_test_split

label = '7_45_same_pay_label'

# Features are every column except the label; the target is kept as a
# one-column DataFrame so it can be concatenated back onto the features.
X = data.drop(columns=[label])
y = data[[label]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.22, random_state=0
)
n_train, n_test = len(X_train), len(X_test)
print("100% data")
print("Number transactions train dataset: ", n_train)
print("Number transactions test dataset: ", n_test)
print("Total number of transactions: ", n_train + n_test)

# Re-assemble the training part into data_train; the held-out part is left
# untouched and is only used for validation.
data_train = pd.concat([X_train, y_train], axis=1)
print(data.shape)
print('---------------------------------------')
print(data_train.shape)

# Result table: one row per decision threshold plus a 'time' row, one column
# per model (columns are added as models are evaluated).
thresholds = [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
              0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]
# Concatenation builds a new list, so `thresholds` itself is not modified.
thresholds_2 = thresholds + ['time']
print(thresholds_2)
result_model_f1 = pd.DataFrame(index=thresholds_2)
print(result_model_f1)
训练,用GradientBoostingClassifier训练试试先,然后在留下的22%数据中,进行验证,f1有87.55%,挺不错的了。
import time
from sklearn.ensemble import GradientBoostingClassifier

# Time the fit + predict of a default GradientBoostingClassifier on the
# training split and evaluate it on the held-out split.
start = time.time()
print('start time:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start)))

gradient_boosting_classifier = GradientBoostingClassifier()
gradient_boosting_classifier.fit(X_train, y_train.values.ravel())
# Predict on the DataFrame (not .values): the model was fitted with column
# names, and passing a bare ndarray makes recent scikit-learn versions warn
# about missing feature names.
y_pred = gradient_boosting_classifier.predict(X_test)

# Stop the clock before plotting so that the recorded duration covers only
# the model work, not figure rendering.
end = time.time()
print(end - start, 's')

# Plot the non-normalized confusion matrix.
# NOTE(review): `plt` and `plot_confusion_matrix` are not defined in this
# section -- presumably matplotlib.pyplot and a notebook helper; confirm.
plt.figure()
plot_confusion_matrix(y_test, y_pred, title='Confusion matrix')
试试用不同的threshold value来划分,看看有没更优的threshold value,0.45会好一点点,87.69%,比0.5,高0.14%,小幅度提升。
from sklearn.metrics import f1_score

# 2. ---------------------------- predict probabilities ----------------------------
# predict_proba returns an array of shape [n_samples, n_classes]; column 1 is
# used below as the score for label 1.
y_pred_proba = gradient_boosting_classifier.predict_proba(X_test)
positive_proba = y_pred_proba[:, 1]

# 3. ---------------------------- record the result per threshold ----------------------------
# Initialise the column as float: it later receives F1 scores and a duration,
# and assigning floats into an int column triggers a dtype warning in recent
# pandas versions.
result_model_f1['GradientBoostingClassifier'] = 0.0
print(result_model_f1)

# First pass: print recall / precision / F1 for every threshold.
# NOTE(review): the comparison is strict (>) although the printed label says
# '>='; left as in the original.
# NOTE(review): `print_recall_precision_f1` and `plot_confusion_matrix` are
# not defined in this section -- presumably notebook helpers; confirm.
for i in thresholds:
    y_test_predictions_high_recall = positive_proba > i
    print('Threshold >= %s' % i)
    print_recall_precision_f1(y_test, y_test_predictions_high_recall)
    print("------------------------------------")

# Second pass: plot a confusion matrix per threshold and store the F1 score.
for i in thresholds:
    y_test_predictions_high_recall = positive_proba > i
    plt.figure(figsize=(4, 4))
    plot_confusion_matrix(y_test, y_test_predictions_high_recall,
                          title='Threshold >= %s' % i)
    result_model_f1.loc[i, 'GradientBoostingClassifier'] = f1_score(
        y_test.values, y_test_predictions_high_recall
    )

# Store the training duration measured around the fit, once, after the loop.
result_model_f1.loc['time', 'GradientBoostingClassifier'] = end - start
print(result_model_f1)
备注下:
用xgboost也试过,默认参数下,差一点点。
导出模型
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; the standalone `joblib` package (a scikit-learn dependency)
# is the supported replacement. Fall back to the old path only on very old
# installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Show the fitted estimator's parameters, then persist it to disk.
print(gradient_boosting_classifier)
joblib.dump(gradient_boosting_classifier, 'gradient_boosting_classifier.model')
二、回归
1、准备数据
# Reload the first-7-days payers and keep only those whose 45-day total is
# larger than their 7-day total, i.e. the players who kept paying.
data = pd.read_csv("tap_fun_train_7_pay.csv", index_col=0, parse_dates=True)
kept_paying = data['prediction_pay_price'] > data['pay_price']
data_pay_more = data[kept_paying].copy()
data_pay_more.shape
# (11309, 111)  about 11k players paid in the first 7 days and kept paying;
# these rows are used for the regression.

# user_id is not a feature.
data_pay_more = data_pay_more.drop(columns=['user_id'])
data = data_pay_more.copy()
data.shape
2、训练模型
分拆数据,训练与验证部分
from sklearn.model_selection import train_test_split

Quantity = 'prediction_pay_price'

# The regression uses a single feature, the 7-day amount (pay_price), to
# predict the 45-day amount. Both sides stay one-column DataFrames.
X = data[['pay_price']]
y = data[[Quantity]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.22, random_state=0
)
n_train, n_test = len(X_train), len(X_test)
print("100% data")
print("Number transactions train dataset: ", n_train)
print("Number transactions test dataset: ", n_test)
print("Total number of transactions: ", n_train + n_test)
来,训练吧,还是GradientBoosting,但是是Regressor版,GradientBoostingRegressor。
import time
start = time.time()
print('start time:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a default GradientBoostingRegressor on the training split and predict
# the held-out split.
gradient_boosting_regression = GradientBoostingRegressor()
gradient_boosting_regression.fit(X_train, y_train.values.ravel())
y_pred = gradient_boosting_regression.predict(X_test.values)

# Root mean squared error on the held-out split.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean squared error: %.2f" % rmse)

# R^2 score: 1 is a perfect prediction.
r2 = r2_score(y_test, y_pred)
print('Variance score: %.2f' % r2)

end = time.time()
print(end - start, 's')
其实Root Mean Squared error,都去到851了,爆表啊,先不管,上系统看看先。
三、用模型预测,上系统看看排名
1、把test set处理得跟train set训练前的样子,要不模型不认啊
# Load the raw test set and apply the same preprocessing as for the training
# set, so the columns match what the models were trained on.
data = pd.read_csv("tap_fun_test.csv", parse_dates=True)
data.shape  # (828934, 108)

# Collect the object-dtype columns and print one sample row of them.
object_columns_df = data.select_dtypes(include=["object"])
print(object_columns_df.iloc[1])

# register_time is the object feature: slice month and day out of the
# string, then drop the raw column.
register_time = data['register_time']
data['register_time_month'] = register_time.str[5:7]
data['register_time_day'] = register_time.str[8:10]
data = data.drop(columns=['register_time'])

# The slices are still strings; convert both to numbers.
date_parts = ['register_time_month', 'register_time_day']
data[date_parts] = data[date_parts].apply(pd.to_numeric)

# Fold month and day into a single number that orders registrations in time.
data['register_time_count'] = (
    data['register_time_month'] * 31 + data['register_time_day']
)
data.shape
# (828934, 110)

# Players who paid during the first 7 days.
paid_in_first_week = data['pay_price'] > 0
data_7_pay = data[paid_in_first_week].copy()
data_7_pay.shape  # (19549, 110)
data_7_pay.to_csv("tap_fun_test_7_pay.csv")

# Players who paid nothing during the first 7 days.
paid_nothing = data['pay_price'] == 0
data_7_NOT_pay = data[paid_nothing].copy()
data_7_NOT_pay.shape  # (809385, 110)
data_7_NOT_pay.to_csv("tap_fun_test_7_NOT_pay.csv")
2、把test set里面前7天没付款的直接预测后续不会给钱。
# Part 1: players who paid nothing in the first 7 days are predicted to pay
# nothing afterwards, so their 0 pay_price is reused as the prediction.
data_test = pd.read_csv("tap_fun_test_7_NOT_pay.csv", index_col=0, parse_dates=True)
print(data_test.shape)  # (809385, 110), about 800k players

# Select user_id and the (zero) pay_price, renaming the latter to the
# prediction column. rename() returns a new frame; renaming in place on a
# column subset raises pandas' SettingWithCopyWarning.
data_test_part1 = data_test[['user_id', 'pay_price']].rename(
    columns={'pay_price': 'prediction_pay_price'}
)
data_test_part1.to_csv('tap_fun_test_part1_still0.csv')
3、在前7天有付费的客户中分类出后续不再付费的客户,并保存
# Part 2 / part 3: among the players who paid in the first 7 days, classify
# who stops paying (label 1) and who keeps paying (label 0).
data_test = pd.read_csv("tap_fun_test_7_pay.csv", index_col=0, parse_dates=True)
print(data_test.shape)  # (19549, 110)

# user_id must be removed before running the model. (The original line ended
# with a stray comma, which turned this into a 1-tuple and broke `.values`.)
data_test_model = data_test.drop(['user_id'], axis=1)

# Run the classifier.
y_test_pred = gradient_boosting_classifier.predict(data_test_model.values)
# ndarray -> DataFrame. Column names are given as a list: a set has no
# defined order and is rejected by recent pandas versions.
y_test_pred = pd.DataFrame(y_test_pred, columns=['pred_label'])

# Restart the index at 0 so it lines up with the prediction frame.
# reset_index keeps every column's dtype; a .values round-trip would coerce
# all columns to one common dtype (turning an integer user_id into a float).
data_test = data_test.reset_index(drop=True)

# Attach the predicted label to the original rows.
y_test_pred = pd.concat([data_test, y_test_pred], axis=1)
y_test_pred.shape  # (19549, 111)
y_test_pred['pred_label'].value_counts()
# 1    15587   predicted to stop paying
# 0     3962   predicted to keep paying

# part2: label 1 -> the prediction is simply the 7-day amount.
# part3: label 0 -> handed to the regression model.
is_done_paying = y_test_pred['pred_label'] == 1
y_test_pred_part2 = y_test_pred.loc[is_done_paying, ['user_id', 'pay_price']].rename(
    columns={'pay_price': 'prediction_pay_price'}
)
y_test_pred_part3 = y_test_pred[y_test_pred['pred_label'] == 0].copy()

y_test_pred_part2.to_csv('tap_fun_test_part2_nopaymore.csv')
y_test_pred_part3.to_csv('tap_fun_test_part3_paymore.csv')
4、前7天付费的,【分类】预判会继续付费的,【回归】出付费金额
# Part 3: for the payers predicted to keep paying, regress the 45-day amount.
y_test_pred_part3 = pd.read_csv("tap_fun_test_part3_paymore.csv", index_col=0, parse_dates=True)
y_test_pred_part3.shape  # (3962, 111)

# Keep the user ids aside so they can be re-attached to the predictions.
user_id_pay_more = y_test_pred_part3['user_id'].values
user_id_pay_more[:10]

# The regressor was trained on pay_price only, so that is the only input.
y_test_pred_part3_test = y_test_pred_part3[['pay_price']]
y_test_pred_part3_test.shape

# Run the regression model.
y_test_pred_part3_howmuch = gradient_boosting_regression.predict(y_test_pred_part3_test.values)

# Combine user ids and predictions. Column names are passed as lists: a set
# has no defined order and is rejected by recent pandas versions.
y_test_pred_part3_user_id = pd.DataFrame(user_id_pay_more, columns=['user_id'])
y_test_pred_part3_howmuch = pd.DataFrame(y_test_pred_part3_howmuch, columns=['prediction_pay_price'])
y_test_pred_part3 = pd.concat([y_test_pred_part3_user_id, y_test_pred_part3_howmuch], axis=1)
y_test_pred_part3.shape  # (3962, 2)
y_test_pred_part3.to_csv('tap_fun_test_part3_paymore_result.csv')
5、重新合并三块的结果
# Read the three partial results back and stack them into one submission.
pred_part1 = pd.read_csv("tap_fun_test_part1_still0.csv", index_col=0, parse_dates=True)
print(pred_part1.shape)
pred_part2 = pd.read_csv("tap_fun_test_part2_nopaymore.csv", index_col=0, parse_dates=True)
print(pred_part2.shape)
pred_part3 = pd.read_csv("tap_fun_test_part3_paymore_result.csv", index_col=0, parse_dates=True)
print(pred_part3.shape)
# (809385, 2)
# (15587, 2)
# (3962, 2)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the frames in the same order and keeps the same index.
pred = pd.concat([pred_part1, pred_part2, pred_part3])
pred.shape  # (828934, 2)

# The index is written as the first CSV column, exactly as before.
# NOTE(review): passing index=False here would write only the two data
# columns if the index column is not wanted in the submission.
pred.to_csv('result.csv')
幸好最后的数量跟test set的客户规模对得上。
6、上系统看分数
有个小插曲,打开文件,把第一列index删掉,然后保存为逗号作为分割的csv文件。
改用RandomForestClassifier+GradientBoostingregression,基本没变化。
改用RandomForestClassifier和LinearRegression,效果突然降低了10。
四、结果分析
从上面三次结果看,回归部分才是提升的关键所在。
sklearn里面有哪些regressor,点这里。
改用RandomForestClassifier和huber_regressor
迟点再试试其他的regressor吧。
五、优化方向
1、后续试试分别计算三个部分的RMSE看看。
2、重点优化回归部分,以及被忽略的"前7天没付款、后来才付款"的用户部分。