We have a set of attributes for existing users and want to assign each user a score; the higher the score, the less likely the user is to default.
Read in the data
import pandas as pd
from sklearn.metrics import roc_auc_score,roc_curve,auc
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import numpy as np
import random
import math
data = pd.read_csv('Bcard.txt')
data.head()
# Columns ending in 'info' are outputs of our own unsupervised system; columns ending in 'score' are paid external credit-bureau data.
# bad_ind indicates whether the user defaulted.
# data.shape  # (95806, 13)
# data.info()  # apart from uid and obs_mth (object), all columns are float, and no column has missing values
data['obs_mth'].value_counts()
#Out: distribution by month
# 2018-07-31 34030
# 2018-06-30 20565
# 2018-11-30 15975
# 2018-10-31 14527
# 2018-09-30 10709
# Name: obs_mth, dtype: int64
Check the Spearman correlation between each feature and the target bad_ind
from scipy.stats import spearmanr
for x in data.columns[3:]:
    print(x, ':', spearmanr(data[['bad_ind', x]]))
#Out:
# td_score : SpearmanrResult(correlation=0.0005868898942345399, pvalue=0.8558534829789535)
# jxl_score : SpearmanrResult(correlation=-0.0009783556270478676, pvalue=0.7620254552161709)
# mj_score : SpearmanrResult(correlation=-0.0012710218588865194, pvalue=0.694017696571299)
# rh_score : SpearmanrResult(correlation=0.0024440034641950155, pvalue=0.4493659451941101)
# zzc_score : SpearmanrResult(correlation=-0.00382429020117564, pvalue=0.23653025372944442)
# zcx_score : SpearmanrResult(correlation=0.007486507503862885, pvalue=0.020489203237985445)
# person_info : SpearmanrResult(correlation=0.0739759050852215, pvalue=2.4126233621356753e-116)
# finance_info : SpearmanrResult(correlation=0.1418184646888764, pvalue=0.0)
# credit_info : SpearmanrResult(correlation=0.10687638355172512, pvalue=2.4225556926981052e-241)
# act_info : SpearmanrResult(correlation=-0.052652662730595476, pvalue=8.596256744325483e-60)
# To evaluate the scorecard's reliability over time, hold out the most recent month of data as an out-of-time test set
train = data[data.obs_mth != '2018-11-30']
overtime_test = data[data.obs_mth == '2018-11-30']
data.columns
#out
# Index(['obs_mth', 'bad_ind', 'uid', 'td_score', 'jxl_score', 'mj_score',
# 'rh_score', 'zzc_score', 'zcx_score', 'person_info', 'finance_info',
# 'credit_info', 'act_info'],
# dtype='object')
feature_1st = data.columns[3:]  # take all candidate features
Extract the training set and the out-of-time test set
x_train = train[feature_1st]
y_train = train['bad_ind']
overtime_x = overtime_test[feature_1st]
overtime_y = overtime_test['bad_ind']
First, train a logistic regression directly on all features
lr_model_all = LogisticRegression(C=0.1)
lr_model_all.fit(x_train,y_train)
#out:
# LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
# intercept_scaling=1, l1_ratio=None, max_iter=100,
# multi_class='auto', n_jobs=None, penalty='l2',
# random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
# warm_start=False)
Model evaluation
For a scorecard model, the key requirement is that logistic regression yields a probability that bad_ind = 1, and that for samples assigned the same probability, the actual proportion of defaulters among them is strictly monotonic in that probability.
KS value: sort the samples by the model's predicted probability of the positive class (note: the probability, not the 0/1 label) in descending order; this gives the order in which cut-off points are taken. Taking cut-offs in that order, compute TPR and FPR at each one (alternatively, take only n cut-offs, at the 1/n, 2/n, 3/n, ... positions). With the cumulative sample percentage on the x-axis (up to 100%) and TPR and FPR on the y-axis, this traces out the KS curve. The point where the TPR and FPR curves are furthest apart is the best cut-off, and that maximum gap is the KS value; a KS above 0.2 is usually taken to mean the model discriminates reasonably well.
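As a minimal sketch of the curve just described (assuming the lr_model_all, x_train and y_train defined above; the variable names here are my own), the KS curve can be drawn by sorting samples by predicted probability and plotting cumulative TPR and FPR against the sample percentage:
# Illustrative sketch only: KS curve with the cumulative sample percentage on the x-axis.
proba = lr_model_all.predict_proba(x_train)[:, 1]         # predicted P(bad_ind = 1)
order = np.argsort(-proba)                                 # sort samples by probability, descending
y_sorted = y_train.values[order]
cum_tpr = np.cumsum(y_sorted) / y_sorted.sum()             # cumulative share of bads captured
cum_fpr = np.cumsum(1 - y_sorted) / (1 - y_sorted).sum()   # cumulative share of goods captured
pct = np.arange(1, len(y_sorted) + 1) / len(y_sorted)      # cumulative sample percentage
from matplotlib import pyplot as plt
plt.plot(pct, cum_tpr, label='TPR')
plt.plot(pct, cum_fpr, label='FPR')
ks_idx = np.argmax(cum_tpr - cum_fpr)                      # position of the largest gap
plt.vlines(pct[ks_idx], cum_fpr[ks_idx], cum_tpr[ks_idx], linestyles='--',
           label='KS = %.3f' % (cum_tpr[ks_idx] - cum_fpr[ks_idx]))
plt.xlabel('Sample percentage')
plt.ylabel('Cumulative rate')
plt.legend(loc='best')
plt.show()
The cells below compute the same KS value more directly from the TPR and FPR returned by sklearn's roc_curve.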
y_bad_proba = lr_model_all.predict_proba(x_train)[:,1]  # predicted P(bad_ind=1) on the training set
# lr_model_all.predict_proba(x_train)[0]  # out: array([0.9929346, 0.0070654])
fpr_lr_train,tpr_lr_train,_ = roc_curve(y_train,y_bad_proba)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()  # training-set KS
print('train_ks : ',train_ks)
# also compute the KS on the out-of-time test set
y_bad_proba_overtime = lr_model_all.predict_proba(overtime_x)[:,1]  # predicted P(bad_ind=1) on the out-of-time set
fpr_lr,tpr_lr,_ = roc_curve(overtime_y,y_bad_proba_overtime)  # ROC of the out-of-time set
overtime_ks = abs(fpr_lr - tpr_lr).max()  # out-of-time KS
print('overtime_ks : ',overtime_ks)
# Out:
# train_ks :  0.4196708292433754
# overtime_ks :  0.3822931458847406
Plot with matplotlib
from matplotlib import pyplot as plt
plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')  # training-set ROC
plt.plot(fpr_lr,tpr_lr,label = 'overtime LR')  # out-of-time ROC
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc = 'best')
plt.show()
The training KS is close to 0.42 but drops to about 0.38 on the out-of-time test set. Since the drop is noticeable, we try feature selection before retraining.
# Feature selection, step 1: compute the variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = np.array(x_train)
for i in range(X.shape[1]):
    print(feature_1st[i], ':', variance_inflation_factor(X, i))
#OUT:
# td_score : 3.4711624479660337
# jxl_score : 3.4813157985309386
# mj_score : 3.5001665990318536
# rh_score : 3.481525651006003
# zzc_score : 3.4656930691044368
# zcx_score : 3.4799441491006324
# person_info : 1.3061065168399477
# finance_info : 1.9899007773938198
# credit_info : 1.2911706397676723
# act_info : 3.1047613119997775
# Features with a low VIF can be kept based on these results
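As a hedged sketch of that filtering step (the cutoff of 3.2 is an arbitrary value chosen for illustration, not a figure from this analysis), the VIFs can be collected into a Series and thresholded:
# Illustrative only: keep features whose VIF is below an assumed cutoff of 3.2.
vif = pd.Series([variance_inflation_factor(X, i) for i in range(X.shape[1])],
                index=feature_1st)
low_vif_features = vif[vif < 3.2].index.tolist()
print(low_vif_features)  # with this cutoff only the *_info features would remain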
# Feature selection with LightGBM
import lightgbm as lgb
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x_train,y_train,random_state=0,test_size=0.2)
def lgb_test(train_x,train_y,test_x,test_y):
    clf = lgb.LGBMClassifier(boosting_type = 'gbdt',
                             objective = 'binary',
                             metric = 'auc',
                             learning_rate = 0.1,
                             n_estimators = 24,
                             max_depth = 5,
                             num_leaves = 20,
                             max_bin = 45,
                             min_data_in_leaf = 10,
                             bagging_fraction = 0.6,
                             bagging_freq = 0,
                             feature_fraction = 0.8,
                             )
    clf.fit(train_x,train_y,eval_set = [(train_x,train_y),(test_x,test_y)],eval_metric = 'auc')
    return clf, clf.best_score_['valid_1']['auc']
lgb_model , lgb_auc = lgb_test(train_x,train_y,test_x,test_y)
feature_importance = pd.DataFrame({'name':lgb_model.booster_.feature_name(),
'importance':lgb_model.feature_importances_}).sort_values(by=['importance'],ascending=False)
feature_importance
Select the features with the highest importance
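The next cell lists those features by hand; as an illustrative alternative (keeping the top 4 is an assumption that simply mirrors the hand-picked list), they could be taken straight from the importance table:
# Illustrative only: take the top-4 features by LightGBM importance.
top_features = feature_importance.head(4)['name'].tolist()
print(top_features)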
# Retrain LR with the selected features
feature_2nd= ['person_info','finance_info','credit_info','act_info']
x = train[feature_2nd]
y = train['bad_ind']
val_x = overtime_test[feature_2nd]
val_y = overtime_test['bad_ind']
lr_model = LogisticRegression(C=0.1,class_weight='balanced')
lr_model.fit(x,y)
y_pred = lr_model.predict_proba(x)[:,1]
fpr_lr_train,tpr_lr_train,_ = roc_curve(y,y_pred)
train_ks = abs(tpr_lr_train - fpr_lr_train).max()
print('train_ks : ',train_ks)
y_pred = lr_model.predict_proba(val_x)[:,1]
fpr_lr,tpr_lr,_ = roc_curve(val_y,y_pred)
val_ks = abs(tpr_lr - fpr_lr).max()
print('val_ks : ',val_ks)
from matplotlib import pyplot as plt
plt.plot(fpr_lr_train,tpr_lr_train,label = 'train LR')
plt.plot(fpr_lr,tpr_lr,label = 'evl LR')
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc = 'best')
plt.show()
# train_ks : 0.4482325608488951 val_ks : 0.4198642457760936
After feature selection, the KS is somewhat better than before.
# Generate the model report
model = lr_model
row_num, col_num = 0, 0
bins = 20  # split the out-of-time set into 20 bins
Y_predict = [s[1] for s in model.predict_proba(val_x)]  # predicted P(bad) on the out-of-time set
Y = val_y
nrows = Y.shape[0]
lis = [(Y_predict[i], Y.array[i]) for i in range(nrows)]  # (prediction, label) pairs
ks_lis = sorted(lis, key=lambda x: x[0], reverse=True)  # sort by prediction, descending
bin_num = int(nrows/bins+1)  # number of records per bin
bad = sum([1 for (p, y) in ks_lis if y > 0.5])  # total number of bads (defaulters)
good = sum([1 for (p, y) in ks_lis if y <= 0.5])  # total number of goods
bad_cnt, good_cnt = 0, 0  # cumulative counts of bads and goods
KS = []
BAD = []
GOOD = []
BAD_CNT = []
GOOD_CNT = []
BAD_PCTG = []
BADRATE = []
dct_report = {}
for j in range(bins):
    ds = ks_lis[j*bin_num: min((j+1)*bin_num, nrows)]
    bad1 = sum([1 for (p, y) in ds if y > 0.5])    # bads in this bin
    good1 = sum([1 for (p, y) in ds if y <= 0.5])  # goods in this bin
    bad_cnt += bad1
    good_cnt += good1
    bad_pctg = round(bad_cnt/sum(val_y),3)  # cumulative bads up to this bin, as a share of all bads
    badrate = round(bad1/(bad1+good1),3)    # bad rate within this bin
    ks = round(math.fabs((bad_cnt / bad) - (good_cnt / good)),3)  # KS at this cut-off
    KS.append(ks)
    BAD.append(bad1)
    GOOD.append(good1)
    BAD_CNT.append(bad_cnt)
    GOOD_CNT.append(good_cnt)
    BAD_PCTG.append(bad_pctg)
    BADRATE.append(badrate)
dct_report['KS'] = KS
dct_report['BAD'] = BAD
dct_report['GOOD'] = GOOD
dct_report['BAD_CNT'] = BAD_CNT
dct_report['GOOD_CNT'] = GOOD_CNT
dct_report['BAD_PCTG'] = BAD_PCTG
dct_report['BADRATE'] = BADRATE
val_repot = pd.DataFrame(dct_report)
val_repot
The report shows that the BAD count per bin is not strictly monotonic, which means the model still needs tuning until BAD decreases strictly across the bins.
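As a minimal check (a sketch using the val_repot frame built above), the monotonicity of the BAD column can be verified directly:
# Sketch: is the per-bin BAD count strictly decreasing across the 20 bins?
bad_counts = val_repot['BAD']
strictly_decreasing = all(bad_counts.iloc[i] > bad_counts.iloc[i + 1]
                          for i in range(len(bad_counts) - 1))
print('BAD strictly decreasing:', strictly_decreasing)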
After tuning, map the model's predicted probability to a negatively correlated value plus a base score to produce the final score (similar to a Sesame Credit score).
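A common way to do this mapping is the points/odds scaling used for scorecards; in the sketch below the base score of 650 and the PDO of 50 ("points to double the odds") are assumed illustration values, not figures from this project:
# Illustrative scorecard scaling: higher score means lower predicted default risk.
def probability_to_score(p, base_score=650, pdo=50):
    odds = p / (1 - p)                                   # odds of being bad
    return base_score - pdo / np.log(2) * np.log(odds)
scores = probability_to_score(lr_model.predict_proba(val_x)[:, 1])
print(pd.Series(scores).describe())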
-----------------------end