Background
- As a loan service platform, the company provides loans to a large number of small business owners, self-employed individuals, and ordinary wage earners. Unfortunately, this population is often taken advantage of by untrustworthy lenders.
- Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. Our main goal is to predict whether a customer will default on a loan, using multidimensional data analysis and a machine learning model.
- Since risk control must weigh both risks and benefits, AUC is used to evaluate the model. The greater the AUC, the better the model performs.
Mind Map

1.Exploratory Data Analysis
1.1 Import Module
import gc
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#Common Model Helpers
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split

#Visualization
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
warnings.filterwarnings('ignore')
1.2 Read Data
# Load the training and test sets from local CSV files.
# NOTE(review): the paths are absolute and machine-specific. Later sections use
# `train_data` / `test_data`, whose assignment is not visible here — confirm how
# they relate to `df` / `df_test`.
df=pd.read_csv(r'D:/BaiduNetdiskDownload/train.csv')
df_test=pd.read_csv(r'D:/BaiduNetdiskDownload/test.csv')
1.3 View Data Information


1.3.1 The data
- train data: 800,000 rows * 47 columns (including the label column)
- test data: 200,000 rows * 46 columns
image.png
image.png
1.3.2 Mean, maximum, minimum, and quartile Values
# data describe
df.describe()

# Reshape the numerical columns to long form and draw one box plot per column
# (4 plots per row, independent axes).
# NOTE(review): `numerical_feature` is only assigned in section 1.3.5.1 below,
# so that cell has to run before this one.
f3=pd.melt(df,value_vars=numerical_feature)
g=sns.FacetGrid(f3,col="variable",col_wrap=4,sharex=False,sharey=False)
g=g.map(sns.boxplot,'value')

Here is the box plot of the training data; it shows the distribution of every numerical feature.
1.3.3 Amount of null values and data types
df.info()

import toad
toad.detector.detect(df).sort_values('missing',ascending=False)

'missing' refers to the proportion of missing data.
# Count the missing values of every column, keep the columns that have at
# least one, and plot them from most to least affected.
df_isnull_sum = pd.DataFrame(df.isnull().sum())
df_isnull_sum = df_isnull_sum[df_isnull_sum[0] > 0]
_missing_ranked = df_isnull_sum.sort_values(0, ascending=False)
plt.figure(figsize=(20, 5))
sns.set(style='darkgrid')
sns.barplot(
    y=_missing_ranked.index,
    x=_missing_ranked[0],
    palette="Blues_r",
)
plt.title('Empty data summary')

The figure above shows the amount of missing data.
1.3.4 Unique values
# Report every column that has at most one distinct value.
for i in df.columns:
    n_unique = df[i].nunique()  # evaluated once per column instead of twice
    if n_unique <= 1:
        print('Only', i, 'with', n_unique, 'unique values.')
Only 'policyCode' with 1 unique values.
toad.detector.detect(df).sort_values('unique',ascending=False)

1.3.5 Divide numerical continuous variables and discrete variables
1.3.5.1 continuous variables
def get_numercial_serial_features(data, feas, max_discrete_unique=10):
    """Split columns into continuous-like and discrete-like groups.

    A column counts as discrete when its number of distinct values
    (``data[fea].nunique()``) is at most ``max_discrete_unique``; otherwise it
    counts as continuous.

    Args:
        data: table indexed by column name whose columns expose ``nunique()``
            (a pandas DataFrame in this notebook).
        feas: iterable of column names to classify.
        max_discrete_unique: largest distinct-value count still treated as
            discrete. Defaults to 10, the threshold this notebook uses.

    Returns:
        A tuple ``(numerical_serial_feature, numerical_noserial_feature)``:
        the continuous column names and the discrete column names, each in
        the order they appear in ``feas``.
    """
    numerical_serial_feature = []
    numerical_noserial_feature = []
    for fea in feas:
        if data[fea].nunique() <= max_discrete_unique:
            numerical_noserial_feature.append(fea)
        else:
            numerical_serial_feature.append(fea)
    return numerical_serial_feature, numerical_noserial_feature
# Split the columns by dtype: non-object columns are treated as numerical,
# object columns as categorical (text).
numerical_feature=list(df.select_dtypes(exclude='object').columns)
category_feature=list(df.select_dtypes(include='object').columns)
label='isDefault'
# The target column is not a feature; list.remove raises ValueError if it is absent.
numerical_feature.remove(label)
# Further split the numerical columns into continuous and discrete ones.
numerical_serial_feature,numerical_noserial_feature=get_numercial_serial_features(df,numerical_feature)
# Distribution plot of every continuous numerical feature, one facet per column.
# NOTE(review): sns.distplot and the 'bw' KDE keyword are deprecated in recent
# seaborn releases — confirm against the installed version.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
f = pd.melt(df, value_vars=numerical_serial_feature)
g = sns.FacetGrid(f, col="variable", col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.distplot,"value", kde_kws = {'bw' : 1})
# plt.xticks acts on the current axes only, not on every facet.
plt.xticks(rotation=90)

Here is the KDE plot of the training data; it shows the distribution of every continuous numerical feature.
1.3.5.2 discrete numerical variables
# Count plot of every discrete numerical feature, one facet (and one hue) per column.
f1=pd.melt(df.loc[:,numerical_noserial_feature],value_vars=numerical_noserial_feature)
g=sns.FacetGrid(f1,col='variable',hue="variable",col_wrap=4, sharex=False, sharey=False)
g=g.map(sns.countplot,'value')

Here is the bar plot of the training data; it shows the distribution of every discrete numerical feature.
1.3.5.3 discrete text variables
# Count plots for the text-valued columns: grade, subGrade, employmentLength,
# issueDate and the part of earliesCreditLine after the '-'.
plt.figure(figsize=(30, 30))
sns.set(style='darkgrid')

# Top row: three plots side by side.
grade_order = sorted(df['grade'].unique())
ax1 = plt.subplot(331)
sns.countplot(data=df, x='grade', order=grade_order, ax=ax1)
ax1.set_xticklabels(grade_order, fontsize=15)
ax1.set_title('Amount of Grades', fontsize=20)

sub_grade_order = sorted(df['subGrade'].unique())
ax2 = plt.subplot(332)
sns.countplot(data=df, x='subGrade', order=sub_grade_order, ax=ax2)
ax2.set_xticklabels(sub_grade_order, rotation=90, fontsize=15)
ax2.set_title('Amount of subGrades', fontsize=20)

ax3 = plt.subplot(333)
employmentLength_order = ['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
                          '6 years', '7 years', '8 years', '9 years', '10+ years']
sns.countplot(data=df, x='employmentLength', order=employmentLength_order, ax=ax3)
ax3.set_xticklabels(employmentLength_order, fontsize=15, rotation=45)
ax3.set_title('Amount of employmentLengths', fontsize=20)

# Middle row: issueDate across the full width. The data is passed by keyword,
# like the plots above, instead of as a positional argument.
ax4 = plt.subplot(312)
sns.countplot(data=df, x='issueDate', order=sorted(df['issueDate'].unique()), ax=ax4)
plt.xticks(range(1, len(df['issueDate'].unique()), 3), rotation=45, fontsize=15)
ax4.set_title('Amount of issueDate', fontsize=20, ha='center')

# Bottom row: second '-'-separated part of earliesCreditLine.
a = df['earliesCreditLine'].apply(lambda x: x.split('-')[1])
ax5 = plt.subplot(313)
sns.countplot(x=a, order=sorted(a.unique()), ax=ax5)
plt.xticks(range(1, len(a.unique()), 1), rotation=45, fontsize=15)
ax5.set_title('Amount of earliesCreditLine', fontsize=20, ha='center')

# Layout calls must come before plt.show(); after it they no longer affect the
# rendered figure. The explicit spacing is applied last so it still wins.
plt.tight_layout()
plt.subplots_adjust(wspace=0.2, hspace=0.3)
plt.show()

Here is the bar plot of the training data; it shows the distribution of every discrete text feature.
1.3.6 The distribution difference between defaulting and non-defaulting customers in loan interest rate, installment amount and annual income

2.Feature Engineering
2.1 Data Preprocessing
- According to the situation of data visualization, the text features are converted numerically first.
- It is worth noting that the conversion of the loan grade and the sub-grades should correspond to the values in an orderly manner according to the grades.
- The data cannot be automatically converted by label encoding, otherwise the algorithm will misunderstand the meaning of the values of the above two features in the subsequent machine learning.
# Ordinal encodings: each text value maps to its position in the ordered tuple.
_GRADE_ORDER = ('A', 'B', 'C', 'D', 'E', 'F', 'G')
_EMPLOYMENT_LENGTH_ORDER = (
    '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
    '6 years', '7 years', '8 years', '9 years', '10+ years',
)
# grade map: 'A' -> 0 ... 'G' -> 6
grade_dict = {grade: rank for rank, grade in enumerate(_GRADE_ORDER)}
# employmentLength map: '< 1 year' -> 0, '1 year' -> 1 ... '10+ years' -> 10
employmentLength_dict = {text: years for years, text in enumerate(_EMPLOYMENT_LENGTH_ORDER)}
# Conversion of subGrade to a number
def get_sub_grade(grade, sub):
    """Combine a numeric grade and a sub-grade string into one ordinal number.

    Args:
        grade: numeric grade (the notebook passes the value mapped through
            ``grade_dict``).
        sub: sub-grade string; only its second character is read and it must
            be a digit (e.g. ``'B3'`` -> 3).

    Returns:
        ``grade * 10 + <second character of sub as int>``.

    Raises:
        ValueError: if the second character of ``sub`` is not a digit.
        IndexError: if ``sub`` has fewer than two characters.
    """
    return grade * 10 + int(sub[1])
# Conversion of issueDate to a month index
def trans_issueDate(issueDate):
    """Convert a 'YYYY-MM-DD' string into a month index.

    Only the year and month parts are read, so the day part (and any further
    '-'-separated suffix) is optional.

    Args:
        issueDate: date string whose first two '-'-separated parts are the
            year and the month number.

    Returns:
        ``year * 12 + month - 1`` as an int.

    Raises:
        ValueError: if there are fewer than two parts or they are not integers.
    """
    year, month = issueDate.split('-')[:2]
    return int(year) * 12 + int(month) - 1
# Month abbreviation -> month number, built once instead of on every call.
_MONTH_NUMBER = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
                 "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}


# Conversion of earliesCreditLine to a month index
def trans_earliesCreditLine(earliesCreditLine):
    """Convert a 'Mon-YYYY' string (e.g. 'Aug-2001') into a month index.

    Args:
        earliesCreditLine: string made of a three-letter English month
            abbreviation, a '-', and a year.

    Returns:
        ``year * 12 + month - 1`` as an int, the same formula that
        ``trans_issueDate`` uses.

    Raises:
        KeyError: if the month abbreviation is unknown.
        ValueError: if the string does not split into exactly two parts or the
            year is not an integer.
    """
    month, year = earliesCreditLine.split('-')
    return int(year) * 12 + _MONTH_NUMBER[month] - 1
# Convert the text columns of every frame in `dfs` to numbers, in place.
# NOTE(review): `dfs` is only assigned in section 2.2.1 below, so that
# assignment has to be in scope first. The loop variable also rebinds the
# module-level name `df`.
for df in dfs:
    print(df.shape)
    # Values missing from the mapping (e.g. NaN) are passed through unchanged.
    df['grade'] = df['grade'].apply(lambda x: grade_dict.get(x, x))
    # subGrade needs the already-encoded grade, so it has to follow the line above.
    df['subGrade'] = df.apply(lambda row: get_sub_grade(row['grade'], row['subGrade']), axis=1)
    df['employmentLength'] = df['employmentLength'].apply(lambda x: employmentLength_dict.get(x, x))
    #df['issueYear'] = df['issueDate'].apply(lambda x: int(x.split('-')[0]))
    df['issueDate'] = df['issueDate'].apply(trans_issueDate)
    df['earliesCreditLine'] = df['earliesCreditLine'].apply(trans_earliesCreditLine)
    # Missing dti becomes 1000; negative dti is replaced by its absolute value.
    df['dti'] = np.abs(df['dti'].fillna(1000))
2.2 Feature Creation
2.2.1 Create business feature
dfs = [train_data, test_data]
# Snapshot of train+test taken BEFORE the derived columns below are added, so
# `concated_df` only holds the columns that exist at this point.
concated_df = pd.concat(dfs)
for df in dfs:
    # Months between the earliest credit line and the loan issue date.
    df['date_Diff'] = df['issueDate'] - df['earliesCreditLine']
    # Ratio features; 0.1 is added to most denominators to avoid division by zero.
    df['installment_term_revolBal'] = df['installment'] * 12 * df['term'] / (df['revolBal'] + 0.1)
    df['revolUtil_revolBal'] = df['revolUtil'] / (df['revolBal'] + 0.1)
    # NOTE(review): unlike the other ratios, this denominator has no +0.1 guard.
    df['openAcc_totalAcc'] = df['openAcc'] / df['totalAcc']
    df['loanAmnt_dti_annualIncome'] = df['loanAmnt'] / (np.abs(df['dti']) * df['annualIncome'] + 0.1)
    # Copies of discrete columns under a '_bin' name.
    df['employmentLength_bin'] = df['employmentLength']
    df['issueDate_bin'] = df['issueDate']
    df['earliesCreditLine_bin'] = df['earliesCreditLine']
    df['term_bin'] = df['term']
    df['homeOwnership_bin'] = df['homeOwnership']
    df['annualIncome_loanAmnt'] = df['annualIncome'] / (df['loanAmnt'] + 0.1)
    df['revolBal_loanAmnt'] = df['revolBal'] / (df['loanAmnt'] + 0.1)
    df['revolBal_installment'] = df['revolBal'] / (df['installment'] + 0.1)
    df['annualIncome_installment'] = df['annualIncome'] / (df['installment'] + 0.1)
2.2.2 The continuous feature is divided into bins
# Quantile-bin selected continuous columns. The bin edges are computed on the
# concatenated train+test frame so both parts share them; the first
# dfs[0].shape[0] rows of the result go to dfs[0] and the remainder to dfs[1].
n_train_rows = dfs[0].shape[0]
# (number of quantile bins, columns binned at that resolution):
# annualIncome and loanAmnt use 10 bins; interestRate, dti, installment,
# revolBal and revolUtil use 100 bins.
binning_plan = [
    (10, ['annualIncome', 'loanAmnt']),
    (100, ['interestRate', 'dti', 'installment', 'revolBal', 'revolUtil']),
]
for bin_number, bin_columns in binning_plan:
    label_lst = list(range(bin_number))
    for bin_column in bin_columns:
        # Every '<column>_bin' is derived from its own column; 'interestRate_bin'
        # used to be computed from 'revolBal' by mistake.
        # NOTE(review): with a fixed-length `labels` list, pandas is expected to
        # raise if duplicates='drop' actually removes a bin edge — confirm.
        binned = pd.qcut(concated_df[bin_column], bin_number, labels=label_lst, duplicates='drop')
        # Computed once and split, instead of once per target frame.
        dfs[0][bin_column + '_bin'] = binned[:n_train_rows]
        dfs[1][bin_column + '_bin'] = binned[n_train_rows:]
2.2.3 Create logical feature
- Made a crossover between continuous variables including 'loanAmnt', 'installment', 'interestRate', 'annualIncome', 'dti', 'openAcc', 'revolBal', 'revolUtil', 'totalAcc' and discrete variables including 'issueDate','employmentLength','purpose','homeOwnership'.
- Take loanAmnt and employmentLength as an example: when employmentLength equals 2, the median loanAmnt is 12000, so each 'loanAmnt_employmentLength_ratio' value is loanAmnt/12000.
# Make every categorical column integer-valued: missing values become 0, then
# the column is cast to int.
# NOTE(review): `cate_features` is only assigned in section 3.3 below, so that
# assignment has to be in scope first.
for df in [train_data, test_data]:
    for cate in cate_features:
        df[cate] = df[cate].fillna(0).astype('int')
# Continuous columns that get ratio / rank features.
ratio_feat_lst = ['loanAmnt', 'installment', 'interestRate', 'annualIncome', 'dti', 'openAcc',
                  'revolBal', 'revolUtil', 'totalAcc']
# Distinct values of the grouping columns. set() does not collapse NaN
# (NaN != NaN), so list(set(series)) can keep one entry per missing row;
# dropna().unique() returns each real value exactly once. The entries only
# serve as lookup keys, so their order does not matter.
issueDate_lst = concated_df['issueDate'].dropna().unique().tolist()
employmentLength_lst = concated_df['employmentLength'].dropna().unique().tolist()
purpose_lst = concated_df['purpose'].dropna().unique().tolist()
homeOwnership_lst = concated_df['homeOwnership'].dropna().unique().tolist()
# Mean label of the neighbouring months for every issueDate: all rows whose
# issueDate lies within +/-3 months of dt, excluding dt itself. It does not
# depend on `feat`, so it is computed and assigned once instead of once per feature.
# NOTE(review): this feature is built from the target column — confirm that
# this is acceptable with respect to label leakage.
issueDate_label_mean = {}
for dt in issueDate_lst:
    mask_1 = (
        (concated_df['issueDate'] >= dt - 3)
        & (concated_df['issueDate'] <= dt + 3)
        & (concated_df['issueDate'] != dt)
    )
    issueDate_label_mean[dt] = concated_df.loc[mask_1, 'isDefault'].mean()
for df in [train_data, test_data]:
    df['label_issueDate_mean'] = df['issueDate'].apply(lambda x: issueDate_label_mean[x])

for feat in ratio_feat_lst:
    # Per issueDate: median of `feat`, and the relative rank of each value of
    # `feat`, over the 7-month window dt-3 .. dt+3 (inclusive).
    issueDate_median = {}
    issueDate_item_rank = {}
    for dt in issueDate_lst:
        mask = (concated_df['issueDate'] >= dt - 3) & (concated_df['issueDate'] <= dt + 3)
        item_series = concated_df.loc[mask, feat]
        issueDate_median[dt] = item_series.median()
        item_rank = item_series.rank() / len(item_series)
        # value -> rank divided by window size
        issueDate_item_rank[dt] = dict(zip(item_series, item_rank))

    # Median of `feat` within each employmentLength / purpose / homeOwnership group.
    employmentLength_median = {}
    for et in employmentLength_lst:
        mask = concated_df['employmentLength'] == et
        employmentLength_median[et] = concated_df.loc[mask, feat].median()
    purpose_median = {}
    for pp in purpose_lst:
        mask = concated_df['purpose'] == pp
        purpose_median[pp] = concated_df.loc[mask, feat].median()
    homeOwnership_median = {}
    for ho in homeOwnership_lst:
        mask = concated_df['homeOwnership'] == ho
        homeOwnership_median[ho] = concated_df.loc[mask, feat].median()

    for df in [train_data, test_data]:
        print(feat, df.shape)
        df[feat + '_issueDate_median'] = df['issueDate'].apply(lambda x: issueDate_median[x])
        #df['interestRate_ratio'] = df['interestRate']/df['interestRate_median']
        # Fill NaN with 0 once, on the columns the lambdas read, instead of
        # copying the whole frame for each of the four features below.
        filled = df[['issueDate', 'employmentLength', 'purpose', 'homeOwnership', feat]].fillna(0)
        df[feat + '_issueDate_ratio'] = filled.apply(
            lambda r: issueDate_item_rank[r['issueDate']][r[feat]], axis=1)
        df[feat + '_employmentLength_ratio'] = filled.apply(
            lambda r: r[feat] / employmentLength_median[r['employmentLength']], axis=1)
        df[feat + '_purpose_ratio'] = filled.apply(
            lambda r: r[feat] / purpose_median[r['purpose']], axis=1)
        df[feat + '_homeOwnership_ratio'] = filled.apply(
            lambda r: r[feat] / homeOwnership_median[r['homeOwnership']], axis=1)
        print(feat, df.shape)

After the processing above, 67 new features were created.
2.3 Feature Selection
- Look for stability indicators using TOAD's PSI module
# Candidate features: every test column except the first one.
# NOTE(review): the first column is presumably the row id — confirm.
feat_lst = list(test_data.columns[1:])
# Population Stability Index of each feature between train and test, sorted ascending.
psi_df = toad.metrics.PSI(train_data[feat_lst], test_data[feat_lst]).sort_values(0)
psi_df

- PSI reflects the distribution of validation samples in each fraction segment and the stability of modeling sample distribution. In modeling, we often screen features variables and evaluate model stability. If the model is unstable, it means that the model is not controllable, which is an uncertain risk for the business itself and directly affects the rationality of the decision. This is unacceptable.
- Remove features with PSI greater than 0.25.
# Drop the features whose train/test PSI exceeded 0.25. list.remove raises
# ValueError if a name is not present.
for unstable_feature in (
    'installment_homeOwnership_ratio',
    'installment_purpose_ratio',
    'revolBal_issueDate_ratio',
    'revolBal_loanAmnt',
    'annualIncome_installment',
    'installment_issueDate_ratio',
    'installment_employmentLength_ratio',
    'revolUtil_issueDate_ratio',
    'revolBal_purpose_ratio',
    'revolBal_homeOwnership_ratio',
    'revolBal_employmentLength_ratio',
    'dti_issueDate_ratio',
):
    feat_lst.remove(unstable_feature)
3.Modeling
3.1 Model Selection
3.1.1 Import required modules
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
3.1.2 Encapsulation model
def xgb_model(X_train, y_train, X_test, y_test):
    """Train an XGBoost booster and return its predictions for ``X_test``.

    ``X_train`` / ``y_train`` are split 80/20 into a fitting part and a
    validation part; the validation part drives early stopping. The AUC on the
    validation part and on ``X_test`` / ``y_test`` is printed.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        y_test: true labels for ``X_test``, used only for the printed AUC.

    Returns:
        The booster's predictions for ``X_test``.
    """
    # Only XGBClassifier is imported at file level; bind the `xgb` alias here.
    import xgboost as xgb

    # No random_state is passed, so the split differs between runs.
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = xgb.DMatrix(X_train_split, label=y_train_split)
    valid_matrix = xgb.DMatrix(X_val, label=y_val)
    test_matrix = xgb.DMatrix(X_test)
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': 2020,
        'n_jobs': -1,
        "silent": True,
    }
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                      verbose_eval=200, early_stopping_rounds=200)

    # Score on the validation set.
    # NOTE(review): `ntree_limit` / `best_ntree_limit` and the "silent" parameter
    # belong to older xgboost releases — confirm against the installed version.
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    # Message: "AUC of the tuned single xgboost model on the validation set"
    print('调参后xgboost单模型在验证集上的AUC:{}'.format(roc_auc))

    # Predict the test set and score it.
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_test, test_pred)
    roc_auc1 = metrics.auc(fpr, tpr)
    # Message: "AUC of the tuned single xgboost model on the test set"
    print('调参后xgboost单模型在测试集上的AUC:{}'.format(roc_auc1))
    return test_pred
def lgb_model(X_train, y_train, X_test, y_test):
    """Train a LightGBM booster and return its predictions for ``X_test``.

    ``X_train`` / ``y_train`` are split 80/20 into a fitting part and a
    validation part; the validation part drives early stopping. The AUC on the
    validation part and on ``X_test`` / ``y_test`` is printed.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        y_test: true labels for ``X_test``, used only for the printed AUC.

    Returns:
        The booster's predictions for ``X_test``.
    """
    # Only LGBMClassifier is imported at file level; bind the `lgb` alias here.
    import lightgbm as lgb

    # No random_state is passed, so the split differs between runs.
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)
    # The optimal parameters after tuning
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'min_child_weight': 0.32,
        'num_leaves': 14,
        'max_depth': 4,
        'feature_fraction': 0.81,
        'bagging_fraction': 0.61,
        'bagging_freq': 9,
        'min_data_in_leaf': 13,
        'min_split_gain': 0.27,
        'reg_alpha': 9.58,
        'reg_lambda': 4.62,
        'seed': 2020,
        'n_jobs': -1,
        'silent': True,
        'verbose': -1,
    }
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` as train() keywords
    # belong to older lightgbm releases — confirm against the installed version.
    model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                      verbose_eval=500, early_stopping_rounds=500)

    # Score on the validation set.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    # Message: "AUC of the tuned single lightgbm model on the validation set"
    print('调参后lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))

    # Predict the test set and score it.
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_test, test_pred)
    roc_auc1 = metrics.auc(fpr, tpr)
    # Message: "AUC of the tuned single lightgbm model on the test set"
    print('调参后lightgbm单模型在测试集上的AUC:{}'.format(roc_auc1))
    return test_pred
def cat_model(X_train, y_train, X_test, y_test):
    """Train a CatBoost classifier and return its class-1 probabilities for ``X_test``.

    ``X_train`` / ``y_train`` are split 80/20 into a fitting part and a
    validation part, which is passed to ``fit`` as ``eval_set``. The AUC on the
    validation part and on ``X_test`` / ``y_test`` is printed. The categorical
    columns come from the module-level ``cate_features`` list.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to predict.
        y_test: true labels for ``X_test``, used only for the printed AUC.

    Returns:
        Column 1 of ``predict_proba(X_test)``.
    """
    # No random_state is passed, so the split differs between runs.
    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
    model = CatBoostClassifier(iterations=2500, cat_features=cate_features, eval_metric='AUC',
                               logging_level='Verbose', learning_rate=0.05, depth=6,
                               l2_leaf_reg=5, loss_function='CrossEntropy')
    model.fit(X_train_split, y_train_split, eval_set=(X_val, y_val), plot=False)

    # Score on the validation set.
    val_pred = model.predict_proba(X_val)[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    # Message: "AUC of the tuned single catboost model on the validation set"
    # (the message used to name lightgbm, copied from lgb_model).
    print('调参后catboost单模型在验证集上的AUC:{}'.format(roc_auc))

    # Predict the test set and score it.
    test_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, threshold = metrics.roc_curve(y_test, test_pred)
    roc_auc1 = metrics.auc(fpr, tpr)
    # Message: "AUC of the tuned single catboost model on the test set"
    print('调参后catboost单模型在测试集上的AUC:{}'.format(roc_auc1))
    return test_pred
3.1.3 Comparison of single model scores
3.1.3.1 XGBoost
# Train and score the XGBoost baseline.
# NOTE(review): Xtrain, Ytrain, Xtest and Ytest are not assigned anywhere in
# the visible code — confirm where they come from.
xgb_pred=xgb_model(Xtrain,Ytrain,Xtest, Ytest)

3.1.3.2 LightGBM
# Train and score the LightGBM baseline.
# NOTE(review): Xtrain, Ytrain, Xtest and Ytest are not assigned anywhere in
# the visible code — confirm where they come from.
lgb_pred=lgb_model(Xtrain,Ytrain,Xtest, Ytest)

3.1.3.3 Catboost
# Train and score the CatBoost baseline.
# NOTE(review): Xtrain, Ytrain, Xtest and Ytest are not assigned anywhere in
# the visible code — confirm where they come from.
cat_pred=cat_model(Xtrain,Ytrain,Xtest, Ytest)

Catboost with a score of 0.74+ was finally selected for modeling.
3.2 Adjust the parameters
# Grid search over CatBoost hyper-parameters: 3 depths x 3 learning rates x
# 3 l2_leaf_reg values x 2 loss functions, scored by ROC AUC with 2-fold CV.
from sklearn.model_selection import GridSearchCV
params = {'depth': [2,5,8],
'learning_rate' : [0.05,0.1,0.15],
'l2_leaf_reg': [2,5,8],
'iterations': [10000],
'early_stopping_rounds':[300],
'loss_function':['CrossEntropy','Logloss']
}
# NOTE(review): `cate_features1` is not assigned anywhere in the visible code
# (only `cate_features` is) — confirm which list is intended.
cb_estimator=CatBoostClassifier(cat_features=cate_features1,eval_metric='AUC',logging_level='Verbose')
# Hold out 12.5% of the training rows; they are passed to fit() as eval_set.
X_train, X_validation, y_train, y_validation = train_test_split(train_data.loc[:, feat_lst],
train_data.loc[:, 'isDefault'], test_size=0.125,random_state=2)
cb_model = GridSearchCV(cb_estimator, param_grid = params, scoring="roc_auc", cv = 2)
cb_model.fit(X_train,y_train,eval_set=(X_validation,y_validation))
cb_model.best_params_

Next, train the model according to the best parameters.
3.3 Training model
model_lst = []
# Discrete feature variables handed to CatBoost as categorical columns
# ('revolBal_bin' used to be listed twice).
cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade',
                 'earliesCreditLine_bin', 'regionCode', 'title', 'issueDate_bin', 'term_bin',
                 'interestRate_bin', 'annualIncome_bin', 'loanAmnt_bin', 'homeOwnership_bin',
                 'revolBal_bin', 'dti_bin', 'installment_bin', 'revolUtil_bin']
# One column of test-set probabilities per training run.
pred_data = pd.DataFrame()
for i in range(3):
    # A different seed per run gives a different 87.5% / 12.5% train/validation split.
    X_train, X_validation, y_train, y_validation = train_test_split(
        train_data.loc[:, feat_lst],
        train_data.loc[:, 'isDefault'],
        test_size=0.125, random_state=i * 1000)
    model = CatBoostClassifier(iterations=10000, cat_features=cate_features, eval_metric='AUC',
                               logging_level='Verbose', learning_rate=0.1, depth=6,
                               l2_leaf_reg=5, loss_function='CrossEntropy',
                               early_stopping_rounds=500)
    # X_train / X_validation already hold exactly the feat_lst columns, so no
    # further column selection is needed.
    print(X_train.shape, y_train.shape, X_validation.shape, y_validation.shape)
    model.fit(X_train, y_train, eval_set=(X_validation, y_validation), plot=False)
    preds = model.predict_proba(test_data[feat_lst])[:, 1]
    pred_data[i] = preds
pred_data

Give proper weight to the predicted results
# Blend the three runs: each run's weight is its score's margin over 0.74
# divided by the summed margins, so the three weights add up to 1.
# NOTE(review): the three constants are presumably the AUC scores of runs 0-2 — confirm.
_run_scores = (0.7494391347, 0.7497655984, 0.7485861886)
_score_floor = 0.74
total_score = (_run_scores[0] + _run_scores[1] + _run_scores[2]) - _score_floor * 3
first_weight, second_weight, third_weight = [
    (score - _score_floor) / total_score for score in _run_scores
]
# Despite its name, the 'weight' column holds the weighted prediction.
pred_data['weight'] = (
    pred_data[0] * first_weight
    + pred_data[1] * second_weight
    + pred_data[2] * third_weight
)
pred_data

Submission
# Build the submission: ids 800000..999999 paired with the blended prediction,
# written to submit.csv without the index column.
submit = pd.DataFrame({'id': np.arange(800000, 1000000, 1)})
submit['isDefault'] = pred_data['weight']
submit[['id', 'isDefault']].to_csv('submit.csv', index=False)
- In the end, ranked 43 out of 9,611 participants, ranking in the top 0.5%.
- There is still a lot of room for improvement, given the inadequate computer performance and limited time: for example, fine-tuning the model parameters, creating more features, and stacking more models.
image.png


