基于shapley值的机器学习可解释性分析
shapley值:当多人联盟博弈时,某人加入组织,对最终博弈决策带来的边际贡献;某一个特征引入时,对模型预测结果带来的边际影响(特征重要度)
在机器学习中,shapley值反映特定样本的特征重要度
SHAP:SHapley Additive explanation (SHapley Additive explanation)是一种解释任何机器学习模型输出的博弈论方法
pip install shap
or
conda install -c conda-forge shap
LIME可解释性分析
pip install lime scikit-learn numpy pandas matplotlib pillow
import os
# 存放测试图片
os.mkdir('test_img')
# 存放模型权重文件
os.mkdir('checkpoint')
# 下载样例模型文件
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/checkpoints/fruit30_pytorch_20220814.pth -P checkpoint
# 下载 类别名称 和 ID索引号 的映射字典
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/fruit30/labels_to_idx.npy
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/fruit30/idx_to_labels.npy
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/test/cat_dog.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_fruits.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_orange_2.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_bananan.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_kiwi.jpg -P test_img
# 草莓图像,来源:https://www.pexels.com/zh-cn/photo/4828489/
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/test/0818/test_草莓.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_石榴.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_orange.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_lemon.jpg -P test_img
!wget https://zihao-openmmlab.obs.myhuaweicloud.com/20220716-mmclassification/test/0818/test_火龙果.jpg -P test_img
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/test/watermelon1.jpg -P test_img
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/test/banana1.jpg -P test_img
import lime
import sklearn
import numpy as np
import pandas as pd
import lime
from lime import lime_tabular
#载入数据集
df = pd.read_csv('wine.csv')
df.shape
df
#划分训练集和测试集
from sklearn.model_selection import train_test_split
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
#训练模型
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
#评估模型
score = model.score(X_test, y_test)
score
explainer = lime_tabular.LimeTabularExplainer(
training_data=np.array(X_train), # 训练集特征,必须是 numpy 的 Array
feature_names=X_train.columns, # 特征列名
class_names=['bad', 'good'], # 预测类别名称
mode='classification' # 分类模式
)
idx = 3
data_test = np.array(X_test.iloc[idx]).reshape(1, -1)
prediction = model.predict(data_test)[0]
y_true = np.array(y_test)[idx]
print('测试集中的 {} 号样本, 模型预测为 {}, 真实类别为 {}'.format(idx, prediction, y_true))
exp = explainer.explain_instance(
data_row=X_test.iloc[idx],
predict_fn=model.predict_proba
)
exp.show_in_notebook(show_table=True)
对Pytorch的ImageNet预训练图像分类模型,运行LIME可解释性分析:可视化某个输入图像,某个图块区域,对模型预测为某个类别的贡献影响
import matplotlib.pyplot as plt
from PIL import Image
import torch.nn as nn
import numpy as np
import os, json
import torch
from torchvision import models, transforms
from torch.autograd import Variable
import torch.nn.functional as F
# 有 GPU 就用 GPU,没有就用 CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device', device)
#载入测试图片
img_path = 'test_img/cat_dog.jpg'
img_pil=Image.open(img_path)
img_pil
#载入模型
model = models.inception_v3(pretrained=True).eval().to(device)
idx2label, cls2label, cls2idx = [], {}, {}
with open(os.path.abspath('imagenet_class_index.json'), 'r') as read_file:
class_idx = json.load(read_file)
idx2label = [class_idx[str(k)][1] for k in range(len(class_idx))]
cls2label = {class_idx[str(k)][0]: class_idx[str(k)][1] for k in range(len(class_idx))}
cls2idx = {class_idx[str(k)][0]: k for k in range(len(class_idx))}
#图像预处理
trans_norm = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
trans_A = transforms.Compose([
transforms.Resize((256, 256)),
transforms.CenterCrop(224),
transforms.ToTensor(),
trans_norm
])
trans_B = transforms.Compose([
transforms.ToTensor(),
trans_norm
])
trans_C = transforms.Compose([
transforms.Resize((256, 256)),
transforms.CenterCrop(224)
])
#图像分类预测
input_tensor = trans_A(img_pil).unsqueeze(0).to(device)
pred_logits = model(input_tensor)
pred_softmax = F.softmax(pred_logits, dim=1)
top_n = pred_softmax.topk(5)
top_n
#定义分类预测函数
def batch_predict(images):
batch = torch.stack(tuple(trans_B(i) for i in images), dim=0)
batch = batch.to(device)
logits = model(batch)
probs = F.softmax(logits, dim=1)
return probs.detach().cpu().numpy()
test_pred = batch_predict([trans_C(img_pil)])
test_pred.squeeze().argmax()
#可解释性分析
from lime import lime_image
explainer = lime_image.LimeImageExplainer()
explanation = explainer.explain_instance(np.array(trans_C(img_pil)),
batch_predict, # 分类预测函数
top_labels=5,
hide_color=0,
num_samples=8000) # LIME生成的邻域图像个数
explanation.top_labels[0]
#可视化
from skimage.segmentation import mark_boundaries
temp, mask = explanation.get_image_and_mask(explanation.top_labels[0], positive_only=False, num_features=20, hide_rest=False)
img_boundry = mark_boundaries(temp/255.0, mask)
plt.imshow(img_boundry)
plt.show()
#修改可视化参数
temp, mask = explanation.get_image_and_mask(281, positive_only=False, num_features=20, hide_rest=False)
img_boundry = mark_boundaries(temp/255.0, mask)
plt.imshow(img_boundry)
plt.show()