Predicting Credit Default Risk

Background

As a loan service platform, the company provides loans to a large number of small business owners, self-employed individuals, and ordinary wage earners. Unfortunately, this population is often taken advantage of by untrustworthy lenders.
Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. Our main goal is to predict whether a customer will default on a loan, using multidimensional data analysis and machine learning models.
Since risk control has to weigh both risks and benefits, AUC is used to evaluate the model. The greater the AUC, the better the model performs.

Mind Map

predict fraud.jpeg

1.Exploratory Data Analysis

1.1 Import Module

import gc
import warnings

import pandas as pd
import numpy as np

#Common Model Helpers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.sans-serif']=['SimHei'] 
plt.rcParams['axes.unicode_minus']=False  

warnings.filterwarnings('ignore')

1.2 Read Data

# Load the training and test tables from the local download directory.
df = pd.read_csv('D:/BaiduNetdiskDownload/train.csv')
df_test = pd.read_csv('D:/BaiduNetdiskDownload/test.csv')

1.3 View Data Information

image.png

1.3.1 The data

train data: 800,000 rows *47 columns(includes label column)
test data: 200,000 rows *46 columns

image.png

image.png

1.3.2 Mean, maximum, minimum, and quartile Values

# Summary statistics of the training frame (section 1.3.2: mean, min, max, quartiles)
df.describe()

image.png

# Reshape the numeric columns to long form (one row per variable/value pair)
# and draw one box plot per feature; every facet gets its own axis scales.
# NOTE(review): numerical_feature is only assigned in section 1.3.5 further
# down, so that cell has to be run before this one — confirm the cell order.
f3=pd.melt(df,value_vars=numerical_feature)
g=sns.FacetGrid(f3,col="variable",col_wrap=4,sharex=False,sharey=False)
g=g.map(sns.boxplot,'value')

image.png

The box plots above show the distribution of every numerical feature in the training data.

1.3.3 Amount of null values and data types

# Print the column overview of the training frame (section 1.3.3: null values and data types)
df.info()

image.png

import toad
# Per-column profile produced by toad, ordered by its 'missing' column, largest first
toad.detector.detect(df).sort_values('missing',ascending=False)

image.png

'missing' refers to the proportion of missing data.

# Count the missing values per column and keep only the columns that have any.
df_isnull_sum = df.isnull().sum().to_frame()
df_isnull_sum = df_isnull_sum.loc[df_isnull_sum[0] > 0]

# Horizontal bar chart, column with the most missing values first.
plt.figure(figsize=(20,5))
sns.set(style='darkgrid')
missing_sorted = df_isnull_sum.sort_values(0, ascending=False)
sns.barplot(y=missing_sorted.index, x=missing_sorted[0], palette="Blues_r")
plt.title('Empty data summary')

image.png

The figure above shows the amount of missing data.

1.3.4 Unique values

# Report every column that holds at most one distinct value.
for i in df.columns:
    distinct_count = df[i].nunique()
    if distinct_count > 1:
        continue
    print('Only', i, 'with', distinct_count, 'unique values.')

Only 'policyCode' with 1 unique values.

# Same toad profile, ordered by its 'unique' column, largest first
toad.detector.detect(df).sort_values('unique',ascending=False)

image.png

1.3.5 Divide numerical continuous variables and discrete variables

1.3.5.1 continuous variables

def get_numercial_serial_features(data,feas):
    """Split feature names into continuous and discrete groups.

    A feature counts as discrete when its column in ``data`` has at most
    10 distinct values, and as continuous otherwise.

    Returns a ``(continuous, discrete)`` tuple of lists, each keeping the
    order in which the names appear in ``feas``.
    """
    names = list(feas)
    cardinality = {name: data[name].nunique() for name in names}
    continuous = [name for name in names if cardinality[name] > 10]
    discrete = [name for name in names if cardinality[name] <= 10]
    return continuous, discrete

# Name of the target column; it must not be treated as a feature.
label = 'isDefault'

# Object-typed columns are the text features, everything else is numeric.
category_feature = df.select_dtypes(include='object').columns.tolist()
numerical_feature = df.select_dtypes(exclude='object').columns.tolist()
numerical_feature.remove(label)

# Split the numeric features by cardinality (more than 10 distinct values = continuous).
(numerical_serial_feature,
 numerical_noserial_feature) = get_numercial_serial_features(df, numerical_feature)

# Distribution plot for every continuous numeric feature, one facet each.
# NOTE(review): sns.distplot and the 'bw' KDE keyword are deprecated in recent
# seaborn releases — confirm against the installed version.
plt.rcParams.update({'font.sans-serif': ['SimHei'], 'axes.unicode_minus': False})
f = df.melt(value_vars=numerical_serial_feature)
g = sns.FacetGrid(f, col="variable", col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.distplot, "value", kde_kws={'bw': 1})
plt.xticks(rotation=90)

image.png

The density plots above show the distribution of every continuous numerical feature in the training data.

1.3.5.2 discrete numerical variables

# Count plot for every discrete (low-cardinality) numeric feature, one facet each.
f1 = df[numerical_noserial_feature].melt(value_vars=numerical_noserial_feature)
g = sns.FacetGrid(f1, col='variable', hue="variable", col_wrap=4,
                  sharex=False, sharey=False)
g = g.map(sns.countplot, 'value')

image.png

The bar plots above show the distribution of every discrete numerical feature in the training data.

1.3.5.3 discrete text variables

# Count plots for the text-valued features on one figure: grade, subGrade and
# employmentLength share the top row; issueDate and the year part of
# earliesCreditLine each get a full-width row below.
plt.figure(figsize=(30,30))
sns.set(style='darkgrid')

ax1=plt.subplot(331)
grade_order=sorted(df['grade'].unique())
sns.countplot(data=df,x='grade',order=grade_order,ax=ax1)
ax1.set_xticklabels(grade_order,fontsize=15)
ax1.set_title('Amount of Grades',fontsize=20)

ax2=plt.subplot(332)
subGrade_order=sorted(df['subGrade'].unique())
sns.countplot(data=df,x='subGrade',order=subGrade_order,ax=ax2)
ax2.set_xticklabels(subGrade_order,rotation=90,fontsize=15)
ax2.set_title('Amount of subGrades',fontsize=20)

ax3=plt.subplot(333)
# Explicit order: a plain sort would not put '< 1 year' first and '10+ years' last.
employmentLength_order=['< 1 year','1 year','2 years','3 years', '4 years','5 years','6 years','7 years','8 years','9 years','10+ years']
sns.countplot(data=df,x='employmentLength',order=employmentLength_order,ax=ax3)
ax3.set_xticklabels(employmentLength_order,fontsize=15,rotation=45)
ax3.set_title('Amount of employmentLengths',fontsize=20)

ax4=plt.subplot(312)
# Pass the Series as x= explicitly; the keyword is unambiguous regardless of
# how a seaborn release interprets the first positional argument.
sns.countplot(x=df['issueDate'],order=sorted(df['issueDate'].unique()),ax=ax4)
plt.xticks(range(1,len(df['issueDate'].unique()),3),rotation=45,fontsize=15)
ax4.set_title('Amount of issueDate',fontsize=20,ha='center')

# Keep only the part after the '-' of earliesCreditLine (the year in 'Mon-YYYY').
a=df['earliesCreditLine'].apply(lambda x:x.split('-')[1])
ax5=plt.subplot(313)
sns.countplot(x=a,order=sorted(a.unique()),ax=ax5)
plt.xticks(range(1,len(a.unique()),1),rotation=45,fontsize=15)
ax5.set_title('Amount of earliesCreditLine',fontsize=20,ha='center')

# Layout has to be settled before show(): once the figure has been shown,
# a later tight_layout() no longer affects it. The explicit spacing is applied
# last so that it wins over the automatic one.
plt.tight_layout()
plt.subplots_adjust(wspace =0.2, hspace =0.3)
plt.show()

image.png

The bar plots above show the distribution of every discrete text feature in the training data.

1.3.6 The distribution difference between defaulting and non-defaulting customers in loan interest rate, installment amount and annual income

image.png

2.Feature Engineering

2.1 Data Preprocessing

Based on the data visualization above, the text features are first converted to numbers.
Note that the loan grade and sub-grade must be mapped to values that preserve the order of the grades.
They cannot simply be label-encoded automatically; otherwise the algorithm would misinterpret the meaning of these two features in the subsequent machine learning.

# Loan grade letters mapped to their rank: 'A' -> 0 ... 'G' -> 6.
grade_dict = {letter: rank for rank, letter in enumerate('ABCDEFG')}

# Employment length labels mapped to a number of years
# ('< 1 year' -> 0, '10+ years' -> 10).
employmentLength_dict = {'< 1 year': 0, '1 year': 1, '10+ years': 10}
employmentLength_dict.update({'{} years'.format(years): years for years in range(2, 10)})
# Conversion of subGrade
def get_sub_grade(grade, sub):
    """Combine a numeric grade and a sub-grade label into one ordered number.

    The second character of ``sub`` is taken as the level inside the grade,
    so grade 2 with sub-grade 'C3' becomes 23.
    """
    level = int(sub[1])
    return 10 * grade + level
# Conversion of issueDate
def trans_issueDate(issueDate):
    """Convert a 'YYYY-MM-DD' string into a running month index.

    The index is ``year * 12 + month - 1``; the day part is ignored.
    """
    year_text, month_text, _day_text = issueDate.split('-')
    months_from_years = int(year_text) * 12
    return months_from_years + int(month_text) - 1
# Conversion of earliesCreditLine
def trans_earliesCreditLine(earliesCreditLine):
    """Convert a 'Mon-YYYY' string (e.g. 'Aug-2001') into a running month index.

    The index is ``year * 12 + month - 1``, the same scale that
    ``year * 12 + month - 1`` gives for any other date.
    """
    month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    month_number = {name: number for number, name in enumerate(month_names, start=1)}
    month_name, year_text = earliesCreditLine.split('-')
    month = month_number[month_name]
    return int(year_text) * 12 + month - 1
# Encode the text columns of every frame in dfs as numbers, in place.
for df in dfs:
    print(df.shape)
    # Ordered letter grade -> rank; values without a mapping are left untouched.
    df['grade'] = df['grade'].apply(lambda value: grade_dict.get(value, value))
    # subGrade is built from the already-encoded grade, so grade must be converted first.
    df['subGrade'] = [get_sub_grade(grade, sub) for grade, sub in zip(df['grade'], df['subGrade'])]
    df['employmentLength'] = df['employmentLength'].apply(
        lambda value: employmentLength_dict.get(value, value))
    # Both dates become month indexes (year * 12 + month - 1).
    df['issueDate'] = df['issueDate'].apply(trans_issueDate)
    df['earliesCreditLine'] = df['earliesCreditLine'].apply(trans_earliesCreditLine)
    # Missing dti is replaced by 1000; the sign is dropped from all values.
    df['dti'] = df['dti'].fillna(1000).abs()

2.2 Feature Creation

2.2.1 Create business feature

# Train and test frames are processed identically; concated_df is a stacked
# copy taken before any of the new columns are added.
dfs = [train_data, test_data]
concated_df = pd.concat(dfs)
for df in dfs:
    # Denominators are shifted by 0.1 so that a zero value cannot divide by zero.
    revolBal_shifted = df['revolBal'] + 0.1
    loanAmnt_shifted = df['loanAmnt'] + 0.1
    installment_shifted = df['installment'] + 0.1

    df['date_Diff'] = df['issueDate'] - df['earliesCreditLine']
    df['installment_term_revolBal'] = df['installment']*12*df['term']/revolBal_shifted
    df['revolUtil_revolBal'] = df['revolUtil']/revolBal_shifted
    df['openAcc_totalAcc'] = df['openAcc']/df['totalAcc']
    df['loanAmnt_dti_annualIncome'] = df['loanAmnt']/(df['dti'].abs()*df['annualIncome']+0.1)

    # Plain copies of the discrete columns under a *_bin name.
    for source_col in ('employmentLength', 'issueDate', 'earliesCreditLine', 'term', 'homeOwnership'):
        df[source_col + '_bin'] = df[source_col]

    df['annualIncome_loanAmnt'] = df['annualIncome']/loanAmnt_shifted
    df['revolBal_loanAmnt'] = df['revolBal']/loanAmnt_shifted
    df['revolBal_installment'] = df['revolBal']/installment_shifted
    df['annualIncome_installment'] = df['annualIncome']/installment_shifted

2.2.2 The continuous feature is divided into bins

# Quantile-bin the continuous features. The bin edges are computed on the
# stacked train+test frame so that both frames share the same bins; the result
# is then split back by position (train rows first, test rows after).
n_train = dfs[0].shape[0]
bin_plan = (
    # annualIncome and loanAmnt are split into 10 bins
    (10, ['annualIncome', 'loanAmnt']),
    # interestRate, dti, installment, revolBal and revolUtil are split into 100 bins
    (100, ['interestRate', 'dti', 'installment', 'revolBal', 'revolUtil']),
)
for bin_number, bin_columns in bin_plan:
    label_lst = list(range(bin_number))
    for bin_column in bin_columns:
        # Each feature is binned from its own column. The earlier version
        # built interestRate_bin from revolBal, which made it a duplicate
        # of revolBal_bin.
        binned = pd.qcut(concated_df[bin_column], bin_number, labels=label_lst, duplicates='drop')
        dfs[0][bin_column + '_bin'] = binned.iloc[:n_train]
        dfs[1][bin_column + '_bin'] = binned.iloc[n_train:]

2.2.3 Create logical feature

We crossed the continuous variables ('loanAmnt', 'installment', 'interestRate', 'annualIncome', 'dti', 'openAcc', 'revolBal', 'revolUtil', 'totalAcc') with the discrete variables ('issueDate', 'employmentLength', 'purpose', 'homeOwnership').
Take loanAmnt and employmentLength as an example: when employmentLength equals 2, the median loanAmnt is 12000, so 'loanAmnt_employmentLength_ratio' for each of those rows is loanAmnt/12000.

# Cast the categorical columns to integers, treating missing values as 0.
for df in [train_data, test_data]:
    for cate in cate_features:
        df[cate] = df[cate].fillna(0).astype('int')

ratio_feat_lst = ['loanAmnt', 'installment', 'interestRate', 'annualIncome', 'dti', 'openAcc',
                  'revolBal', 'revolUtil', 'totalAcc']
issueDate_lst = list(set(concated_df['issueDate']))
employmentLength_lst = list(set(concated_df['employmentLength']))
purpose_lst = list(set(concated_df['purpose']))
homeOwnership_lst = list(set(concated_df['homeOwnership']))

# Mean label in the window of 3 months before to 3 months after each issue
# month, excluding the month itself. It does not depend on the feature, so it
# is computed once here instead of once per feature.
issueDate_label_mean = {}
for dt in issueDate_lst:
    mask_1 = (concated_df['issueDate'] >= dt-3)&(concated_df['issueDate'] <= dt+3)&(concated_df['issueDate'] != dt)
    label_series = concated_df.loc[mask_1, 'isDefault']
    issueDate_label_mean[dt] = label_series.mean()

# Columns the row-wise lookups below read in addition to the feature itself.
group_cols = ['issueDate', 'employmentLength', 'purpose', 'homeOwnership']

for feat in ratio_feat_lst:
    issueDate_median = {}
    issueDate_item_rank = {}
    for dt in issueDate_lst:
        # Window of 3 months before to 3 months after the issue month (inclusive)
        mask = (concated_df['issueDate'] >= dt-3)&(concated_df['issueDate'] <= dt+3)
        item_series = concated_df.loc[mask, feat]
        # Median of the feature inside the window
        issueDate_median[dt] = item_series.median()
        # Rank of every value inside the window, scaled by the window size
        item_rank = item_series.rank()/len(item_series)
        issueDate_item_rank[dt] = dict(zip(item_series, item_rank))
    # Median of the feature for every employment length
    employmentLength_median = {}
    for et in employmentLength_lst:
        mask = concated_df['employmentLength'] == et
        item_series = concated_df.loc[mask, feat]
        employmentLength_median[et] = item_series.median()
    # Median of the feature for every loan purpose
    purpose_median = {}
    for pp in purpose_lst:
        mask = concated_df['purpose'] == pp
        item_series = concated_df.loc[mask, feat]
        purpose_median[pp] = item_series.median()
    # Median of the feature for every home ownership category
    homeOwnership_median = {}
    for ho in homeOwnership_lst:
        mask = concated_df['homeOwnership'] == ho
        item_series = concated_df.loc[mask, feat]
        homeOwnership_median[ho] = item_series.median()
    for df in [train_data, test_data]:
        print(feat, df.shape)
        df['label_issueDate_mean'] = df['issueDate'].apply(lambda x: issueDate_label_mean[x])
        df[feat+'_issueDate_median'] = df['issueDate'].apply(lambda x: issueDate_median[x])
        # Fill missing values once, and only in the columns the lookups read,
        # instead of filling the whole frame again for every new column.
        filled = df[[feat] + group_cols].fillna(0)
        df[feat+'_issueDate_ratio'] = filled.apply(lambda r: issueDate_item_rank[r['issueDate']][r[feat]], axis=1)
        df[feat+'_employmentLength_ratio'] = filled.apply(lambda r: r[feat]/employmentLength_median[r['employmentLength']], axis=1)
        df[feat+'_purpose_ratio'] = filled.apply(lambda r: r[feat]/purpose_median[r['purpose']], axis=1)
        df[feat+'_homeOwnership_ratio'] = filled.apply(lambda r: r[feat]/homeOwnership_median[r['homeOwnership']], axis=1)
        print(feat, df.shape)

image.png

After the processing above, 67 new features were created.

2.3 Feature Selection

Look for stability indicators using TOAD's PSI module

# Candidate features: every test-set column except the first one
# (presumably the row id — verify against the data).
feat_lst = list(test_data.columns[1:])  
# PSI of every candidate feature between the train and test frames, sorted.
# NOTE(review): sort_values(0) depends on the type toad returns here and on
# the pandas version — confirm it still sorts by the PSI value.
psi_df = toad.metrics.PSI(train_data[feat_lst], test_data[feat_lst]).sort_values(0)  
psi_df

image.png

PSI compares the distribution of the validation sample across score segments with that of the modeling sample, and so measures how stable the distribution is. In modeling, it is commonly used to screen feature variables and to evaluate model stability. An unstable model is not controllable: it is an uncertain risk for the business itself and directly affects how reasonable the decisions are. This is unacceptable.
Remove features with PSI greater than 0.25.

# Features dropped from the candidate list because of their PSI.
unstable_features = (
    'installment_homeOwnership_ratio',
    'installment_purpose_ratio',
    'revolBal_issueDate_ratio',
    'revolBal_loanAmnt',
    'annualIncome_installment',
    'installment_issueDate_ratio',
    'installment_employmentLength_ratio',
    'revolUtil_issueDate_ratio',
    'revolBal_purpose_ratio',
    'revolBal_homeOwnership_ratio',
    'revolBal_employmentLength_ratio',
    'dti_issueDate_ratio',
)
for unstable_feature in unstable_features:
    # list.remove raises ValueError if a name is not in the list.
    feat_lst.remove(unstable_feature)

3.Modeling

3.1 Model Selection

3.1.1 Import required modules

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

3.1.2 Encapsulation model

def xgb_model(X_train, y_train, X_test, y_test):
    """Train an XGBoost binary classifier with early stopping and report its AUC.

    A random 20% of (X_train, y_train) is held out as the validation set that
    drives early stopping. The AUC on that hold-out and on (X_test, y_test)
    is printed.

    Returns the booster's predictions for X_test.
    """
    # Only XGBClassifier is imported at file level, so the module alias used
    # below is bound here.
    import xgboost as xgb

    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = xgb.DMatrix(X_train_split , label=y_train_split)
    valid_matrix = xgb.DMatrix(X_val , label=y_val)
    test_matrix = xgb.DMatrix(X_test)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'n_jobs': -1,
        "silent": True,
    }
    # The last entry of the watchlist ('eval') is the one early stopping monitors.
    watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
    model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)

    # Score on the validation set, using only the trees up to the best round.
    # NOTE(review): ntree_limit / best_ntree_limit are not available in recent
    # xgboost releases — confirm against the installed version.
    val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后xgboost单模型在验证集上的AUC：{}'.format(roc_auc))

    # Predict and score the test set
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_test,test_pred)
    roc_auc1 = metrics.auc(fpr, tpr)
    print('调参后xgboost单模型在测试集上的AUC：{}'.format(roc_auc1))
    return test_pred

def lgb_model(X_train, y_train, X_test, y_test):
    """Train a LightGBM binary classifier with early stopping and report its AUC.

    A random 20% of (X_train, y_train) is held out as the validation set that
    drives early stopping. The AUC on that hold-out and on (X_test, y_test)
    is printed.

    Returns the model's predictions for X_test.
    """
    # Only LGBMClassifier is imported at file level, so the module alias used
    # below is bound here.
    import lightgbm as lgb

    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    # The optimal parameters after tuning
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'min_child_weight': 0.32,
        'num_leaves': 14,
        'max_depth': 4,
        'feature_fraction': 0.81,
        'bagging_fraction': 0.61,
        'bagging_freq': 9,
        'min_data_in_leaf': 13,
        'min_split_gain': 0.27,
        'reg_alpha': 9.58,
        'reg_lambda': 4.62,
        'seed': 2020,
        'n_jobs':-1,
        'silent': True,
        'verbose': -1,
    }

    # NOTE(review): verbose_eval / early_stopping_rounds as train() arguments
    # are not accepted by recent lightgbm releases (callbacks are used
    # instead) — confirm against the installed version.
    model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=500)

    # Score on the validation set at the best iteration
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))

    # Predict and score the test set
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_test,test_pred)
    roc_auc1 = metrics.auc(fpr, tpr)
    print('调参后lightgbm单模型在测试集上的AUC：{}'.format(roc_auc1))
    return test_pred

def cat_model(X_train, y_train, X_test, y_test):
    """Train a CatBoost classifier and report its AUC.

    A random 20% of (X_train, y_train) is held out and passed to CatBoost as
    the evaluation set. The columns named in the module-level cate_features
    list are treated as categorical. The AUC on the hold-out and on
    (X_test, y_test) is printed.

    Returns the predicted probability of the positive class for X_test.
    """
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    model = CatBoostClassifier(iterations=2500,    cat_features=cate_features,eval_metric='AUC',logging_level='Verbose',
                               learning_rate=0.05, depth=6, l2_leaf_reg=5, loss_function='CrossEntropy')
    model.fit(X_train_split,y_train_split, eval_set=(X_val, y_val), plot=False)

    # Score on the validation set. The messages name catboost: they were
    # copied from lgb_model and mislabelled this model as lightgbm.
    val_pred = model.predict_proba(X_val)[:,1]
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后catboost单模型在验证集上的AUC：{}'.format(roc_auc))

    # Predict and score the test set
    test_pred = model.predict_proba(X_test)[:,1]
    fpr, tpr, threshold = metrics.roc_curve(y_test,test_pred)
    roc_auc1 = metrics.auc(fpr, tpr)
    print('调参后catboost单模型在测试集上的AUC：{}'.format(roc_auc1))
    return test_pred

3.1.3 Comparison of single model scores

3.1.3.1 XGBoost

# Train the XGBoost baseline; xgb_model prints both AUCs and returns the test predictions.
# NOTE(review): Xtrain/Ytrain/Xtest/Ytest are not defined in the visible part of this file.
xgb_pred=xgb_model(Xtrain,Ytrain,Xtest, Ytest)

image.png

3.1.3.2 LightGBM

# Train the LightGBM baseline; lgb_model prints both AUCs and returns the test predictions.
# NOTE(review): Xtrain/Ytrain/Xtest/Ytest are not defined in the visible part of this file.
lgb_pred=lgb_model(Xtrain,Ytrain,Xtest, Ytest)

image.png

3.1.3.3 Catboost

# Train the CatBoost baseline; cat_model prints both AUCs and returns the test predictions.
# NOTE(review): Xtrain/Ytrain/Xtest/Ytest are not defined in the visible part of this file.
cat_pred=cat_model(Xtrain,Ytrain,Xtest, Ytest)

image.png

Catboost with a score of 0.74+ was finally selected for modeling.

3.2 Adjust the parameters

from sklearn.model_selection import GridSearchCV
# Grid of CatBoost settings to try: 3 depths x 3 learning rates x 3 L2
# strengths x 2 loss functions, each with up to 10000 iterations and
# early stopping after 300 rounds.
params = {'depth': [2,5,8],
          'learning_rate' : [0.05,0.1,0.15],
          'l2_leaf_reg': [2,5,8],
          'iterations': [10000],
          'early_stopping_rounds':[300],
           'loss_function':['CrossEntropy','Logloss']  
         }
# NOTE(review): cate_features1 is not defined in the visible part of this
# file (the training section uses cate_features) — confirm which list is meant.
cb_estimator=CatBoostClassifier(cat_features=cate_features1,eval_metric='AUC',logging_level='Verbose')
# Hold out 12.5% of the training rows; they are passed to fit() as eval_set.
X_train, X_validation, y_train, y_validation = train_test_split(train_data.loc[:, feat_lst],  
train_data.loc[:, 'isDefault'],  test_size=0.125,random_state=2) 

# 2-fold cross-validated search scored by ROC AUC
cb_model = GridSearchCV(cb_estimator, param_grid = params, scoring="roc_auc", cv = 2)
cb_model.fit(X_train,y_train,eval_set=(X_validation,y_validation))

cb_model.best_params_

image.png

Next, train the model according to the best parameters.

3.3 Training model

model_lst = []
# Discrete feature columns that CatBoost treats as categorical.
# 'revolBal_bin' used to be listed twice; each column is now named once.
cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade', 'earliesCreditLine_bin',
                 'regionCode', 'title', 'issueDate_bin', 'term_bin',
                 'interestRate_bin', 'annualIncome_bin', 'loanAmnt_bin', 'homeOwnership_bin',
                 'revolBal_bin', 'dti_bin', 'installment_bin', 'revolUtil_bin']

# One column of test-set probabilities per training run.
pred_data=pd.DataFrame()
for i in range(3):
    # A different seed per run gives a different 87.5% / 12.5% train/validation split.
    X_train, X_validation, y_train, y_validation = train_test_split(
        train_data.loc[:, feat_lst],
        train_data.loc[:, 'isDefault'],
        test_size=0.125, random_state=i*1000)
    model = CatBoostClassifier(iterations=10000, cat_features=cate_features, eval_metric='AUC', logging_level='Verbose',
                               learning_rate=0.1, depth=6, l2_leaf_reg=5, loss_function='CrossEntropy', early_stopping_rounds=500)
    # X_train / X_validation already hold exactly the feat_lst columns, so
    # they are used directly instead of being re-selected.
    print(X_train.shape, y_train.shape, X_validation.shape, y_validation.shape)
    model.fit(X_train, y_train, eval_set=(X_validation, y_validation), plot=False)
    # Probability of the positive class for every test row
    preds = model.predict_proba(test_data[feat_lst])[:, 1]
    pred_data[i]=preds
pred_data

image.png

Give proper weight to the predicted results

# Blend the three runs: each run is weighted by how far its AUC lies above
# 0.74, normalised so that the three weights sum to 1.
run_auc = (0.7494391347, 0.7497655984, 0.7485861886)
total_score = (run_auc[0] + run_auc[1] + run_auc[2]) - 0.74*3
first_weight, second_weight, third_weight = [(auc - 0.74) / total_score for auc in run_auc]
pred_data['weight'] = (pred_data[0] * first_weight
                       + pred_data[1] * second_weight
                       + pred_data[2] * third_weight)
pred_data

image.png

Submission

# Build the submission table: ids 800000..999999 paired with the blended
# prediction, written to submit.csv without the index column.
submit = pd.DataFrame({'id': np.arange(800000, 1000000, 1)})
submit['isDefault'] = pred_data['weight']
submit.to_csv('submit.csv', columns=['id', 'isDefault'], index=False)

In the end, ranked 43 out of 9,611 participants, ranking in the top 0.5%.
There is still a lot of room for improvement, limited here by computing power and time: finer tuning of the model parameters, creating more features, and more model stacking.

image.png

Predicting Credit Default Risk