# coding=utf-8
import pandas as pd # DataFrame handling
import numpy as np # numerical computing
import sys # used for setting breakpoints while debugging
'''
Step 1: data cleaning (finance_class_demo.py)
Step 2: modeling on the cleaned data (create_data_model.py)
'''
print('Starting data cleaning...')
# skiprows=1 skips the first line of the file (a descriptive line above the real
# header); low_memory=True parses the file in chunks, which can leave a column
# with mixed dtypes (values of different types cannot be added together)
df = pd.read_csv("./data/LoanStats_2018Q1.csv", skiprows=1, low_memory=True)
# print(df.head(10)) # print the first 10 rows
# print(df.info()) # print summary info, e.g. [10 rows x 145 columns]
# Drop columns that carry no signal: axis=1 drops along columns (axis=0 would
# drop rows); inplace=True modifies df directly instead of returning a copy
df.drop('id', axis=1, inplace=True)
df.drop('member_id', axis=1, inplace=True)
# print(df.head(10))
# Attribute access df.term is equivalent to df['term']; inside a regex character
# class, ^ means "not", so '[^0-9]+' matches runs of non-digit characters, and
# each match is replaced with `value`
# regex=True must be set for pattern matching
df.term.replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
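# Illustration (toy value, not from the dataset): with regex=True a cell like
# '36 months' becomes '36':
# pd.Series(['36 months']).replace('[^0-9]+', '', regex=True)  # -> ['36']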
# Clean the int_rate feature by stripping the special character '%'
df.int_rate.replace('%', value='', inplace=True, regex=True)
df.drop('sub_grade', axis=1, inplace=True)
# print(df.emp_title.value_counts()) # frequency count of one feature's values
df.drop('emp_title', axis=1, inplace=True)
# Replace the literal string 'n/a' with np.nan so it counts as truly missing
df.emp_length.replace('n/a', np.nan, inplace=True)
df.emp_length.replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
# Drop missing values in one pass: how='all' drops a row/column only when every
# value is NaN; how='any' drops it when at least one value is NaN
df.dropna(axis=0, how='all', inplace=True)
df.dropna(axis=1, how='all', inplace=True)
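# Toy example (not part of the pipeline) showing how='all' vs how='any':
# d = pd.DataFrame([[np.nan, np.nan], [1.0, np.nan]])
# d.dropna(how='all')  # keeps the second row, it still has one value
# d.dropna(how='any')  # drops both rows, each contains at least one NaN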
# Only after the empty rows/columns are removed does df.info() show useful
# per-column details
# print(df.info())
# Drop the columns (axis=1) that contain too little data
df.drop(['debt_settlement_flag', 'settlement_status', 'settlement_date',
'settlement_amount', 'settlement_percentage', 'settlement_term'], axis=1, inplace=True)
'''
Drop columns that are non-null but highly repetitive:
e.g. columns that are all 0, all 'f', all 'Individual', all 'n', or mostly one value.
Approach:
first drop the highly repetitive float columns, then the highly repetitive
object columns (a programmatic sketch follows right after this docstring)
'''
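# A possible automation of the rule above (a sketch, not the original approach:
# the script drops these columns by hand below, and the 0.95 threshold is an
# assumption, not taken from the original):
# for col in df.columns:
#     top_freq = df[col].value_counts(normalize=True, dropna=False).iloc[0]
#     if top_freq >= 0.95:
#         print('mostly-constant column:', col)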
# len(df[col].unique()) is the number of distinct values in the column;
# str.format fills the two {} placeholders with its two arguments
# for col in df.select_dtypes(include=['float']).columns:
# print('col {} has {}'.format(col, len(df[col].unique())))
'''
Drop the low-information feature columns (a positive:negative sample ratio of
about 10:1 is desirable):
col delinq_2yrs has 20
col inq_last_6mths has 6
col mths_since_last_delinq has 132
'''
df.drop(['delinq_2yrs', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
'pub_rec', 'total_acc', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'acc_now_delinq', 'open_acc_6m',
'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'il_util',
'open_rv_12m', 'open_rv_24m', 'all_util', 'inq_fi', 'total_cu_tl',
'inq_last_12m', 'acc_open_past_24mths', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_rcnt_tl',
'mort_acc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl',
'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'percent_bc_gt_75', 'pub_rec_bankruptcies',
'tax_liens', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_open_act_il',
'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',
'sec_app_mths_since_last_major_derog'], axis=1, inplace=True)
# Inspect the object-typed columns
# for col in df.select_dtypes(include=['object']).columns:
# print('col {} has {}'.format(col, len(df[col].unique())))
# Drop the object columns with few distinct values
df.drop(['term', 'grade', 'emp_length', 'home_ownership',
'verification_status', 'issue_d', 'pymnt_plan', 'purpose', 'title',
'zip_code', 'addr_state', 'initial_list_status', 'last_pymnt_d', 'next_pymnt_d',
'last_credit_pull_d', 'application_type', 'verification_status_joint', 'hardship_flag', 'disbursement_method',
'debt_settlement_flag_date', 'earliest_cr_line',
'sec_app_earliest_cr_line'], axis=1, inplace=True)
# Write the cleaned data to a new file at the given path
# df.to_csv('./df_data.csv')
'''
Label binarization:
check the positive:negative ratio; roughly 10:1 is considered ideal here.
Since we are looking for positives:
set the positive class 'Fully Paid' to 1
and the negative class 'Current' to 0
(to look for negatives instead, set 'Current' to 1);
set all other statuses to np.nan.
Current               100414
Fully Paid              5390
Late (31-120 days)       873
In Grace Period          668
(a quick ratio check on these counts follows below)
'''
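# A quick sanity check (illustrative, not in the original flow): derive the
# negative:positive ratio from the counts above, assuming 'Current' is the
# negative class and 'Fully Paid' the positive one:
# counts = df.loan_status.value_counts()
# print('neg:pos ratio = %.1f : 1' % (counts['Current'] / counts['Fully Paid']))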
# print(df.loan_status.value_counts())
df.loan_status.replace('Fully Paid', value=1, inplace=True)
df.loan_status.replace('Current', value=0, inplace=True)
df.loan_status.replace(['Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)', 'Charged Off', 'Default'],
np.nan, inplace=True)
# print(df.info()) # inspect the features
# Drop the rows whose loan_status was set to NaN in the previous step;
# subset restricts the check to that column
df.dropna(subset=['loan_status'], how='any', inplace=True)
# print(df.info()) # inspect the features
# Fill every remaining NaN in the samples with 0.0
df.fillna(0.0, inplace=True)
print('Data cleaning finished...')
''' ===================== data cleaning complete ======================= '''
''' ===== check the correlations among the cleaned features and keep only one
column from each group of linearly correlated columns =====
highly correlated column pairs (>= 0.95):
funded_amnt        loan_amnt      1.000000
funded_amnt_inv    loan_amnt      0.999996
                   funded_amnt    0.999996
installment        loan_amnt      0.945075
'''
print('Removing duplicated, linearly correlated feature columns...')
# Build the correlation matrix of the cleaned samples and keep only one column
# from each correlated group; df.corr() computes it directly (numpy offers
# np.corrcoef / np.cov for the same job)
cor = df.corr()
# Overwrite the matrix in place: the first ':' selects all rows, the second all
# columns; np.tril with k=-1 keeps the strictly lower triangle (diagonal and
# upper triangle zeroed), so each correlated pair appears only once
cor.iloc[:, :] = np.tril(cor, k=-1)
cor = cor.stack()
# print(cor[(cor > 0.55) | (cor < -0.55)])
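# A sketch of deriving the drop list automatically instead of hard-coding it
# below (assumption: flag pairs whose absolute correlation is >= 0.95, per the
# docstring above):
# high_pairs = cor[cor.abs() >= 0.95]
# print(high_pairs)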
# Drop the columns whose correlation exceeds 0.95
df.drop(['funded_amnt', 'loan_amnt', 'out_prncp', 'total_pymnt_inv',
'tot_cur_bal'], axis=1, inplace=True)
# One-hot (dummy-variable) encode all remaining object-typed columns
df = pd.get_dummies(df)
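# Illustration of get_dummies on toy data (not part of the pipeline): each
# object column becomes one 0/1 indicator column per category:
# pd.get_dummies(pd.DataFrame({'g': ['A', 'B', 'A']}))  # -> columns g_A, g_B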
df.to_csv('./data/feature.csv')
print('Cleaned data file exported!')
''' ===================== create_data_model.py ======================= '''
# coding=utf-8
import pandas as pd # DataFrame handling
from sklearn.model_selection import train_test_split # split train/test sets
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn import metrics # supervised evaluation metrics
from sklearn.ensemble import RandomForestClassifier # random forest
from sklearn.ensemble import GradientBoostingClassifier # GBDT
from sklearn import svm
import time
from sklearn.model_selection import GridSearchCV
import matplotlib as mpl # plotting
import matplotlib.pyplot as plt # plotting
import numpy as np
import sys
'''
Step 2: modeling on the cleaned data
'''
# Read the cleaned data file (the raw file was removed because of its size)
df = pd.read_csv('./data/feature.csv', low_memory=True)
print('Cleaned data file loaded, starting prediction: ')
# Assign the label column to Y and drop it from the feature matrix X
Y = df.loan_status
X = df.drop('loan_status', axis=1, inplace=False)
# Split into training and test sets; test_size=0.3 puts 30% of the samples in
# the test set and 70% in the training set
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
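# Optional variant (an assumption, not in the original script): given the class
# imbalance noted during cleaning, stratify=Y would keep the label ratio the
# same in both splits:
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)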
# Create the logistic regression estimator
print('='*10 + 'Logistic Regression' + '='*10)
lr = LogisticRegression()
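# Note (an aside, not from the original script): on unscaled features the
# default solver may hit its iteration limit before converging; raising the
# limit is a common tweak, e.g. lr = LogisticRegression(max_iter=1000)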
# start time
start = time.time()
print('Starting logistic regression prediction:')
# fit on the training set
lr.fit(x_train, y_train)
# predict on the training set
train_predict = lr.predict(x_train)
# sklearn metrics take (y_true, y_pred) in that order
train_f1 = metrics.f1_score(y_train, train_predict)
# accuracy on the training set
train_acc = metrics.accuracy_score(y_train, train_predict)
# recall on the training set
train_rec = metrics.recall_score(y_train, train_predict)
print('Logistic regression performance on the training set:')
print('training f1_score: %.4f' % train_f1, end=' ')
print('training accuracy: %.4f' % train_acc, end=' ')
print('training recall: %.4f' % train_rec)
# predict on the test set
test_predict = lr.predict(x_test)
test_f1 = metrics.f1_score(y_test, test_predict)
test_acc = metrics.accuracy_score(y_test, test_predict)
test_rec = metrics.recall_score(y_test, test_predict)
print('Logistic regression performance on the test set:')
print('test f1_score: %.4f' % test_f1, end=' ')
print('test accuracy: %.4f' % test_acc, end=' ')
print('test recall: %.4f' % test_rec)
end = time.time() - start
print('Logistic regression train/test prediction finished, run_time: %.2fs' % end)
print('='*30)
print('='*10 + 'Random Forest' + '='*10)
print('Starting random forest prediction:')
rf = RandomForestClassifier()
start = time.time()
rf.fit(x_train, y_train)
train_predict = rf.predict(x_train)
train_f1 = metrics.f1_score(y_train, train_predict)
train_acc = metrics.accuracy_score(y_train, train_predict)
train_rec = metrics.recall_score(y_train, train_predict)
print('training f1_score: %.4f' % train_f1, end=' ')
print('training accuracy: %.4f' % train_acc, end=' ')
print('training recall: %.4f' % train_rec)
test_predict = rf.predict(x_test)
test_f1 = metrics.f1_score(y_test, test_predict)
test_acc = metrics.accuracy_score(y_test, test_predict)
test_rec = metrics.recall_score(y_test, test_predict)
print('test f1_score: %.4f' % test_f1, end=' ')
print('test accuracy: %.4f' % test_acc, end=' ')
print('test recall: %.4f' % test_rec)
end = time.time() - start
print('Random forest train/test prediction finished, run_time: %.2fs' % end)
print('='*30)
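# GridSearchCV is imported above but never used; a minimal tuning sketch for
# the random forest (the parameter grid values are assumptions, not from the
# original script):
# param_grid = {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}
# gs = GridSearchCV(RandomForestClassifier(), param_grid, scoring='f1', cv=3)
# gs.fit(x_train, y_train)
# print(gs.best_params_, gs.best_score_)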
# print('='*10 + 'GBDT' + '='*10)
print('='*10 + 'Plotting' + '='*10)
# Extract feature importances from the fitted forest via its
# feature_importances_ attribute and rank them
feature_importance = rf.feature_importances_
# scale to percentages of the largest importance
feature_importance = 100.0*(feature_importance/feature_importance.max())
# np.argsort sorts ascending, so the last 10 indices point at the 10 largest
index = np.argsort(feature_importance)[-10:]
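# Example (illustrative): np.argsort([30, 10, 20]) -> array([1, 2, 0]); the
# last entries index the largest values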
# Draw a horizontal bar chart; alpha is the transparency
plt.barh(np.arange(10), feature_importance[index], color='dodgerblue', alpha=0.4)
print(np.array(X.columns)[index])
plt.yticks(np.arange(10), np.array(X.columns)[index])
plt.xlabel('Relative importance')
plt.title('Top 10 Importance Variable')
plt.show()
# Data download: https://pan.baidu.com/s/1UDBKjuRcBnkTV2d7HUCMHQ  extraction code: ql84