Machine Learning in Practice: Coding Tips
Handling class imbalance (oversampling):
from imblearn.over_sampling import RandomOverSampler
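The import alone does not resample anything; a minimal usage sketch, assuming a feature matrix X and label vector y (names assumed here, matching the undersampling snippet further down):
ros = RandomOverSampler(random_state=0)
X, y = ros.fit_resample(X, y)  # duplicates minority-class rows until both classes are balanced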
A batch of imports:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler  # oversampling
from imblearn.under_sampling import RandomUnderSampler  # undersampling (used further down)
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler  # data preprocessing
# model selection
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold, GridSearchCV
# model evaluation
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report
from sklearn.metrics import plot_confusion_matrix  # removed in scikit-learn 1.2; see the ConfusionMatrixDisplay sketch below
from sklearn.metrics import cohen_kappa_score  # agreement (inter-rater) metric; can also be used to assess classification quality
# hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# machine learning modules
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier  # ensemble learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import missingno as msno  # library for visualizing missing values
from scipy import stats  # statistical tests (Shapiro-Wilk below)
from scipy.stats import randint  # random integers
from catboost import CatBoostClassifier  # gradient boosting library
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
import lightgbm as lgb  # boosting algorithm based on decision trees
import pickle  # model serialization
Truncated singular value decomposition (TruncatedSVD):
from sklearn.decomposition import TruncatedSVD
a = pd.get_dummies(X['Region_Code'], prefix='Region_Code', drop_first=True)
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
svd.fit(a)
data1 = pd.DataFrame(svd.transform(a))
data1.columns = ['Region_Code_' + str(i) for i in range(5)]
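To see how much of the variance the 5 components keep, inspect the fitted svd object:
print(svd.explained_variance_ratio_.sum())  # total fraction of variance explained by the 5 components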
Undersampling the majority class:
rus = RandomUnderSampler(random_state=0)
X, y = rus.fit_resample(X, y)
- Recall: the fraction of actual positive samples that the model predicts correctly (see the small sketch right below).
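A minimal sketch tying the definition to code, assuming binary labels y_test and predictions as in the scoring snippet further down (confusion_matrix and recall_score are imported above):
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(tp / (tp + fn))                     # recall computed by hand: TP / (TP + FN)
print(recall_score(y_test, predictions))  # same value from sklearn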
Cross-validation:
kf = KFold(n_splits=10, shuffle=False)  # KFold and cross_val_score are imported above
scores = cross_val_score(model, X_train, y_train, cv=kf)
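Because the target here is imbalanced, StratifiedKFold (imported above) keeps the class ratio in every fold; a sketch, with scoring='roc_auc' as an assumed metric:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
print(scores.mean(), scores.std())  # average score and spread across the 10 folds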
Principal component analysis (PCA):
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
X = raw_nonu[['Vehicle_Age_1-2 Year','Vehicle_Age_< 1 Year','Vehicle_Age_> 2 Years']]
pca = PCA(n_components=1)  # number of principal components
pca.fit(X)
pca_Vehicile = pca.transform(X)
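To check how much information the single component retains, inspect the explained variance of the fitted pca:
print(pca.explained_variance_ratio_)  # fraction of variance captured by the one component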
Normality test (Shapiro-Wilk):
print("Vintage", stats.shapiro(X_train.Vintage))
Computing scores:
accuracy_score(y_test, predictions)
precision_score(y_test, predictions)
recall_score(y_test, predictions)
f1_score(y_test, predictions)
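classification_report (imported above) prints precision, recall and F1 for each class in one call:
print(classification_report(y_test, predictions))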
Confusion matrix:
plot_confusion_matrix(tree_classifier, X_test, y_test)  # (classifier, features, labels)
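plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions the equivalent is ConfusionMatrixDisplay, a sketch:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(tree_classifier, X_test, y_test)
plt.show()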
Standardization / normalization:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
ss = StandardScaler()
train[num_feat] = ss.fit_transform(train[num_feat])
mm = MinMaxScaler()
train[['Annual_Premium']] = mm.fit_transform(train[['Annual_Premium']])
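The fitted scalers should be reused with transform (not fit_transform) on held-out data so test statistics don't leak into the fit; a sketch assuming a test DataFrame with the same columns (the name test is assumed here):
test[num_feat] = ss.transform(test[num_feat])  # reuse the scaler fitted on train
test[['Annual_Premium']] = mm.transform(test[['Annual_Premium']])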
MATLAB-style plotting namespace:
%pylab inline  # IPython magic that pulls numpy/matplotlib names into the namespace; %matplotlib inline plus explicit imports is the recommended alternative
Plotting the ROC curve:
y_score = rf_load.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.title('Random Forest ROC curve: CC Fraud')
plt.xlabel('FPR (False Positive Rate)')
plt.ylabel('TPR (Recall)')
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], ls='dashed', color='black')  # chance diagonal
plt.savefig("./roc_curve.png")  # save before plt.show() so the file is not blank
plt.show()
print('Area under curve (AUC): ', auc(fpr, tpr))
roc_auc_score(y_test, y_score)
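Since the classes are imbalanced, a precision-recall curve (precision_recall_curve and auc are imported above) is often more informative than ROC; a sketch reusing the same y_score:
precision, recall, _ = precision_recall_curve(y_test, y_score)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()
print('PR AUC:', auc(recall, precision))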
Splitting into training and test sets:
from sklearn.model_selection import train_test_split
train_target = train['Response']
# drop the target column from the features to avoid leakage; stratify keeps the class ratio in both splits
x_train, x_test, y_train, y_test = train_test_split(train.drop('Response', axis=1), train_target, stratify=train_target, random_state=0)
Decision tree:
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, y_train)
XGBoost:
model_xgb = XGBClassifier(eval_metric='mlogloss')  # recent xgboost versions take eval_metric in the constructor ('logloss' is the usual choice for a binary target)
model_xgb.fit(X, y)
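plot_importance (imported above) gives a quick look at which features the fitted booster relies on:
plot_importance(model_xgb, max_num_features=10)  # top 10 features by importance score
plt.show()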
Random forest (with randomized hyperparameter search):
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
random_search = {'criterion': ['entropy', 'gini'],
                 'max_depth': [2, 3, 4, 5, 6, 7, 10],
                 'min_samples_leaf': [4, 6, 8],
                 'min_samples_split': [5, 7, 10],
                 'n_estimators': [300]}
clf = RandomForestClassifier()
model = RandomizedSearchCV(estimator=clf, param_distributions=random_search, n_iter=10,
                           cv=4, verbose=1, random_state=101, n_jobs=-1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
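After the search it helps to inspect the winning configuration and keep the refitted estimator:
print(model.best_params_)        # best hyperparameter combination found
print(model.best_score_)         # its mean cross-validated score
best_rf = model.best_estimator_  # already refit on the full training data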
KNN:
KNN = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p = 2)
KNN.fit(X_train, y_train)
KNN_predictions = KNN.predict(X_test)
KNN_predictions
BaggingClassifier
b_classifier = BaggingClassifier()
b_classifier.fit(X_train, y_train)
b_predictions = b_classifier.predict(X_test)
b_predictions
Building a geometric (log-spaced) sequence:
range_m = np.logspace(0, 2, num=5).astype(int)  # -> array([1, 3, 10, 31, 100])
One-hot encoding:
var = pd.get_dummies(var, prefix='var', drop_first=True)  # drop_first=True drops the first dummy level to avoid collinearity
categorical_vars = ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code']
# columns to encode
for var in categorical_vars:
    data = pd.concat([data, pd.get_dummies(data[var], prefix=var)], axis=1)
    data = data.drop(var, axis=1)  # drop the original column
Saving the model:
import pickle
filename = 'rf_model.sav'
pickle.dump(model, open(filename, 'wb'))
# load the model back
rf_load = pickle.load(open(filename, 'rb'))