回归预测模板

一,EDA

1,观察目标变量

# Histogram + KDE of the target variable (SalePrice) to inspect its distribution.
sns.set_style("white")

f, ax = plt.subplots(figsize=(8, 7))

# NOTE(review): sns.distplot is deprecated in modern seaborn — prefer histplot/displot.
sns.distplot(train['SalePrice'], color="b");

ax.xaxis.grid(False)

ax.set(ylabel="Frequency")

ax.set(xlabel="SalePrice")

ax.set(title="SalePrice distribution")

sns.despine(trim=True, left=True)  # remove the plot frame (top/left spines)

plt.show()

2,求与目标变量的相关系数

# Correlation of every numeric feature with the target, sorted descending.
corr_with_SalePrice = train.corr()["SalePrice"].sort_values(ascending=False)

plt.figure(figsize=(20,6))

# Bar chart of the correlations, excluding the target's self-correlation (1.0).
corr_with_SalePrice.drop("SalePrice").plot.bar()

plt.show()

3,箱线图(离散)观察异常值,一般为与目标变量相关度较高的变量

# Box plot of the target grouped by a discrete, highly correlated feature
# (OverallQual) to spot outliers visually.
data = pd.concat([train['SalePrice'], train['OverallQual']], axis=1)

# BUG FIX: original read `plt.subplots(figsize(8,6))` — the missing `=` turned
# the keyword argument into a call to an undefined name `figsize`.
f, ax = plt.subplots(figsize=(8, 6))

fig = sns.boxplot(x=train['OverallQual'], y="SalePrice", data=data)

# Clamp the y-axis so extreme prices don't compress the boxes.
fig.axis(ymin=0, ymax=800000)

连续值观察异常值

# Scatter of a continuous feature vs. the target to spot outliers.
data=pd.concat([train['SalePrice'],train['TotalBsmtSF']],axis=1)

# alpha=0.3 makes dense overlapping regions show up darker.
data.plot.scatter(x='TotalBsmtSF',y='SalePrice',alpha=0.3,ylim=(0,800000))

二,特征处理

1,使目标变量趋于正态分布

train["SalePrice"]=np.log1p(train["SalePrice"])

获取正态分布后的均值和标准差

from scipy.stats import skew, norm

# Fit a normal distribution to the (log-transformed) target to report mu/sigma.
(mu, sigma) = norm.fit(train['SalePrice'])

print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

2,去除异常值
# Drop outlier rows in place: low quality with high price, and huge living
# area with low price, both contradict the general trend.
train.drop(train[(train['OverallQual']<5)&(train['SalePrice']>200000)].index,inplace=True)

train.drop(train[(train['GrLivArea']>4500)&(train['SalePrice']<300000)].index,inplace=True)

3,缺失值

def percent_missing(df):
    """Return {column: percent of missing values, rounded to 2 decimals}.

    Accepts anything pandas can coerce into a DataFrame (kept from the
    original for backward compatibility).
    """
    frame = pd.DataFrame(df)
    # isnull().mean() gives the NaN fraction per column in one vectorized pass;
    # the original built a second DataFrame just to list columns and filled a
    # dict with an index loop and dict.update.
    return {col: round(frame[col].isnull().mean() * 100, 2)
            for col in frame.columns}

# Percent of missing values per column, sorted worst-first for inspection.
missing = percent_missing(all_features)

df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)

4,处理偏度较大的变量

# Skewness of each numeric feature, most-skewed first.
# NOTE(review): assumes `numeric` is a list of numeric column names — confirm upstream.
skew_features = all_features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

# Features with skew above 0.5 become candidates for a Box-Cox transform.
high_skew = skew_features[skew_features > 0.5]

skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))

skewness = pd.DataFrame({'Skew' :high_skew})

# Display the 10 most skewed features (a bare expression — only shows in a notebook).
skew_features.head(10)


from scipy.special import boxcox1p
# BUG FIX: boxcox_normmax is called below but was never imported (the earlier
# scipy.stats import brings in only skew and norm), causing a NameError.
from scipy.stats import boxcox_normmax

# Box-Cox-transform each highly skewed feature; boxcox_normmax estimates the
# optimal lambda per column. The +1 shift keeps values strictly positive.
for i in skew_index:

    all_features[i] = boxcox1p(all_features[i], boxcox_normmax(all_features[i] + 1))

5,创建新的特征

比如组合特征,log特征,平方特征

6,非数值特征转码

# One-hot encode all categorical columns; reset_index gives a clean 0..n-1 index.
all_features = pd.get_dummies(all_features).reset_index(drop=True)

三,模型

# 12-fold cross-validation split, shuffled with a fixed seed for reproducibility.
kf = KFold(n_splits=12, random_state=42, shuffle=True)

def cv_rmse(model, X=X):
    """Return the per-fold RMSE of `model` on (X, train_labels) under `kf`.

    Note: the default for X is captured from the enclosing scope at def time.
    """
    neg_mse = cross_val_score(model, X, train_labels,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

建立多个模型,进行模型融合

from mlxtend.regressor import StackingCVRegressor

# Stacked ensemble: six base regressors feed a meta-regressor (xgboost).
# use_features_in_secondary=True lets the meta-model see the original features
# in addition to the base models' out-of-fold predictions.
stack_gen = StackingCVRegressor(regressors=(xgboost, lightgbm, svr, ridge, gbr, rf),

                                meta_regressor=xgboost,

                                use_features_in_secondary=True)

def blended_predictions(X):
    """Blend the fitted base models and the stacked ensemble into one prediction.

    Weights sum to 1.0; the stacked regressor gets the largest share (0.35).
    NOTE(review): stack_gen_model is fed np.array(X) — presumably it requires a
    plain ndarray rather than a DataFrame; confirm against its fit call.
    """
    weighted_models = [
        (0.10, ridge_model_full_data),
        (0.20, svr_model_full_data),
        (0.10, gbr_model_full_data),
        (0.10, xgb_model_full_data),
        (0.10, lgb_model_full_data),
        (0.05, rf_model_full_data),
    ]
    blend = sum(weight * model.predict(X) for weight, model in weighted_models)
    return blend + 0.35 * stack_gen_model.predict(np.array(X))

最后输出结果记得np.expm1



最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。