import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv("creditcard.csv")
data.head()
help(pd.value_counts)  # compute a histogram of the counts of non-null values
# Output (abridged): value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True)
#   Compute a histogram of the counts of non-null values.
count_classes = pd.value_counts(data['Class'], sort=True).sort_index()
# sort_index() sorts the result by its row index
count_classes.plot(kind = 'bar')
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
Two ways to handle imbalanced samples: oversampling and undersampling
X = data.loc[:, data.columns != 'Class']  # every column except 'Class' goes into X
y = data.loc[:, data.columns == 'Class']
# Count the fraudulent transactions
number_records_fraud = len(data[data.Class == 1])
# data.Class == 1 returns a boolean Series; data[mask] returns the full rows where Class is 1
fraud_indices = np.array(data[data.Class == 1].index)
# Collect the indices of the Class == 1 rows into an array
normal_indices = data[data.Class == 0].index
# Indices of the Class == 0 rows
Undersampling
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
# Randomly draw as many normal (Class == 0) indices as there are fraud cases, without replacement
random_normal_indices = np.array(random_normal_indices)
# Appending the 2 indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
# Concatenate the two index arrays
# Under sample dataset
under_sample_data = data.iloc[under_sample_indices,:]
# Select those rows by index to build the undersampled dataset
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Split the undersampled dataset into X and y
# Showing ratio: proportions of normal vs. fraudulent transactions
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
loc is a purely label-based indexer; the "labels" are the row (or column) names you defined.
iloc is a purely integer-position-based indexer; the "positions" are the row numbers 0, 1, 2, ...
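A tiny illustration of the difference, on a toy frame (not the credit card data):
df = pd.DataFrame({'x': [10, 20, 30]}, index=['a', 'b', 'c'])
print(df.loc['b', 'x'])   # 20 -- selected by the label 'b'
print(df.iloc[1, 0])      # 20 -- selected by integer position 1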
from sklearn.model_selection import train_test_split
# train_test_split from sklearn splits the samples into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)
# The test set takes 30% of the samples; random_state fixes the split for reproducibility
# whole dataset
print("Number transactions train dataset:", len(X_train))
print("Number transactions test dataset:", len(X_test))
print("Total number of transactions:", len(X_train) + len(X_test))
# undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample ,y_undersample,test_size = 0.3 ,random_state = 0)
print("")
print("Number transactions train dataset:", len(X_train_undersample))
print("Number transactions test dataset:", len(X_train_undersample))
print("Total number of transactions:", len(X_train_undersample) + len(X_test_undersample))
A few evaluation metrics
With FPR on the x-axis and TPR on the y-axis, the ROC curve is the line through all the (FPR, TPR) points obtained as the classification threshold is varied (in the usual plot, the diagonal is the ROC of random guessing). The closer the curve sits to the top-left corner, the better the classifier.
AUC is defined as the area under the ROC curve.
When comparing classification models, plot each model's ROC curve and use the area under it to rank the models.
The area under the ROC curve (AUC) has a concrete meaning:
Because the area is measured inside a 1x1 square, AUC always lies between 0 and 1.
Treating scores above the threshold as positive and scores below it as negative,
AUC is the probability that, given one randomly drawn positive sample and one randomly drawn negative sample, the classifier scores the positive sample higher than the negative one.
In short: the larger the AUC, the better the classifier.
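sklearn can compute both quantities from predicted probabilities. A minimal sketch on toy labels (not part of the original notebook):
from sklearn.metrics import roc_auc_score, roc_curve
y_true = np.array([0, 0, 1, 1])                  # toy ground truth
probs = np.array([0.1, 0.4, 0.35, 0.8])          # toy positive-class probabilities
fpr, tpr, thresholds = roc_curve(y_true, probs)  # the (FPR, TPR) points of the ROC curve
print(roc_auc_score(y_true, probs))              # 0.75: P(random positive outscores random negative)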
#Recall = TP/(TP+FN)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, recall_score, classification_report
# classification_report in sklearn prints a text report of the main classification
# metrics: per-class precision, recall, F1 score, and so on.
# Main parameters:
# y_true: 1d array, or label indicator array / sparse matrix, of ground-truth values.
# y_pred: 1d array, or label indicator array / sparse matrix, of values returned by the classifier.
# labels: array, shape = [n_labels], optional list of label indices to include in the report.
# target_names: list of strings, optional display names matching the labels (same order).
# sample_weight: array-like of shape = [n_samples], optional sample weights.
# digits: int, number of digits for formatting floating point values in the output.
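A hedged toy example of the report (the labels here are illustrative, not from the dataset):
from sklearn.metrics import classification_report  # already imported above; repeated so the snippet stands alone
print(classification_report([0, 1, 1, 0], [0, 1, 0, 0], target_names=['normal', 'fraud'], digits=3))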
def printing_Kfold_scores(x_train_data, y_train_data):
    fold = KFold(5, shuffle=False)  # 5-fold cross validation; do not shuffle the data before splitting
    # Candidate regularization (penalty) parameters
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)), columns=['C_parameter', 'Mean recall score'])
    # A DataFrame with one row per candidate C and two columns: C_parameter and Mean recall score
    results_table['C_parameter'] = c_param_range  # fill the C_parameter column with the candidates
    # the k-fold split will give 2 lists per iteration: train_indices = indices[0], test_indices = indices[1]
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')
        recall_accs = []  # recall scores collected for this C value
        for iteration, indices in enumerate(fold.split(x_train_data)):
            # Call the logistic regression model with a certain C parameter;
            # liblinear is the solver that supports the L1 penalty
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            # Fit the model on the training folds (indices[0]), then predict on the
            # portion assigned as the 'test cross validation' fold (indices[1])
            lr.fit(x_train_data.iloc[indices[0], :], y_train_data.iloc[indices[0], :].values.ravel())
            # ravel vs. flatten: numpy.flatten() returns a copy, so changes to the result do not
            # affect the original array, while numpy.ravel() returns a view where possible, so
            # changes do propagate back (see the short demo after this function)
            # Predict on the held-out fold
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)
            # Compute the recall score and append it to the list for the current C parameter
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ': recall score = ', recall_acc)
        # The mean value of those recall scores is the metric we want to save and get hold of.
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')
    best_c = results_table.loc[results_table['Mean recall score'].astype('float64').idxmax()]['C_parameter']
    # Finally, report which of the candidate C parameters performed best
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c
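A quick demo of the ravel/flatten distinction mentioned in the comments above, on a toy array:
a = np.array([[1, 2], [3, 4]])
v = a.ravel()     # view (when possible): writes through v reach a
f = a.flatten()   # copy: writes through f do not
v[0] = 99
print(a[0, 0])    # 99 -- the view wrote back into the original
f[1] = 77
print(a[0, 1])    # still 2 -- the copy did not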
enumerate() wraps an iterable (such as a list, tuple, or string) into an indexed sequence,
yielding each element together with its index; it is typically used in a for loop.
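For example:
for i, value in enumerate(['a', 'b', 'c']):
    print(i, value)   # 0 a, then 1 b, then 2 c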
import warnings
warnings.filterwarnings("ignore")  # suppress warnings
best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)  # pass in the undersampled training set
import itertools

def plot_confusion_matrix(cm, classes, title="Confusion matrix", cmap=plt.cm.Blues):
    """Plot a (non-normalized) confusion matrix."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    # Write each cell's count into the cell, in a color that contrasts with the background
    thresh = cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)
# Compute the confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot the non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
, classes=class_names
, title='Confusion matrix')
plt.show()
best_c = printing_Kfold_scores(X_train, y_train)  # repeat the search on the full (imbalanced) training set
C parameter: 0.01
Iteration 0 : recall score = 0.4925373134328358
Iteration 1 : recall score = 0.6027397260273972
Iteration 2 : recall score = 0.6833333333333333
Iteration 3 : recall score = 0.5692307692307692
Iteration 4 : recall score = 0.45
Mean recall score 0.5595682284048672
C parameter: 0.1
Iteration 0 : recall score = 0.5671641791044776
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.6833333333333333
Iteration 3 : recall score = 0.5846153846153846
Iteration 4 : recall score = 0.525
Mean recall score 0.5953102506435158
C parameter: 1
Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7166666666666667
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.5625
Mean recall score 0.612645688837163
C parameter: 10
Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7333333333333333
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.575
Mean recall score 0.6184790221704963
C parameter: 100
Iteration 0 : recall score = 0.5522388059701493
Iteration 1 : recall score = 0.6164383561643836
Iteration 2 : recall score = 0.7333333333333333
Iteration 3 : recall score = 0.6153846153846154
Iteration 4 : recall score = 0.575
Mean recall score 0.6184790221704963
Best model to choose from cross validation is with C parameter = 10.0
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(X_train, y_train.values.ravel())
y_pred = lr.predict(X_test.values)  # predictions on the full (not undersampled) test set
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
, classes=class_names
, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.6190476190476191
lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample_proba = lr.predict_proba(X_test_undersample.values)
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
plt.figure(figsize=(10,10))
j = 1
for i in thresholds:
    # Turn the positive-class probability into hard predictions at threshold i
    y_test_predictions_high_recall = y_pred_undersample_proba[:, 1] > i

    plt.subplot(3, 3, j)
    j += 1

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test_undersample, y_test_predictions_high_recall)
    np.set_printoptions(precision=2)
    print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

    # Plot non-normalized confusion matrix
    class_names = [0, 1]
    plot_confusion_matrix(cnf_matrix
                          , classes=class_names
                          , title='Threshold >= %s' % i)
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 1.0
Recall metric in the testing dataset: 0.9727891156462585
Recall metric in the testing dataset: 0.9387755102040817
Recall metric in the testing dataset: 0.8979591836734694
Recall metric in the testing dataset: 0.8367346938775511
Recall metric in the testing dataset: 0.782312925170068
Recall metric in the testing dataset: 0.5986394557823129
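The sweep shows the trade-off: as the threshold rises, recall falls while precision typically rises. As a sketch (not in the original notebook), the classification_report imported earlier can show both metrics side by side at any single threshold, reusing the model and undersampled test set from the cells above:
y_pred_05 = y_pred_undersample_proba[:, 1] > 0.5   # hard predictions at threshold 0.5
print(classification_report(y_test_undersample, y_pred_05, target_names=['normal', 'fraud']))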