import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay  # plot_confusion_matrix was removed in scikit-learn 1.2
df=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',header=None)
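# The code below refers to columns by name ('cp','restecg','slope','ca','thal','hd'),
# so the frame needs labels after loading with header=None. A likely assignment using
# the UCI attribute names; the exact spellings of the columns not used later are an assumption:
df.columns=['age','sex','cp','restbp','chol','fbs','restecg','thalach',
            'exang','oldpeak','slope','ca','thal','hd']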
df.head();df.dtypes
df['ca'].unique() # list the distinct values in this column
#array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)
df['thal'].unique() # list the distinct values in this column
#array(['6.0', '3.0', '7.0', '?'], dtype=object)
len(df.loc[(df['ca']=='?') |(df['thal']=='?')]) # count how many rows contain '?' in either column
#6
df.loc[(df['ca']=='?') |(df['thal']=='?')] # inspect the rows containing '?'
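# df_np_missing is used below but never defined here; the step that drops the rows
# containing '?' appears to be missing. A minimal sketch of that step, keeping the
# variable name used later (303 rows - 6 rows with '?' = 297):
df_np_missing=df.loc[(df['ca']!='?') & (df['thal']!='?')]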
len(df_np_missing)
#297
df_np_missing['ca'].unique()
#array(['0.0', '3.0', '2.0', '1.0'], dtype=object)
df_np_missing['slope'].unique()
#array([3., 2., 1.])
X=df_np_missing.drop('hd',axis=1).copy()
X.head()
y=df_np_missing['hd'].copy()
y.head()
X['cp'].unique()
pd.get_dummies(X,columns=['cp']).head() # one-hot encode, similar to converting to a factor
X_encoded=pd.get_dummies(X,columns=['cp','restecg','slope','thal']);X_encoded.head()
y.unique()
y_not_zero_index= y > 0
y[y_not_zero_index]=1
y.unique()
#array([0, 1], dtype=int64)
y_not_zero_index.head()
X_train,x_test,y_train,y_test=train_test_split(X_encoded,y,random_state=42)
clf_dt=DecisionTreeClassifier(random_state=42)
clf_dt=clf_dt.fit(X_train,y_train)
clf_dt
#DecisionTreeClassifier(random_state=42)
plt.figure(figsize=(15,7.5))
plot_tree(clf_dt,filled=True,rounded=True,class_names=['No HD',"Yes HD"],feature_names=X_encoded.columns);
# note the trailing semicolon: it suppresses the text output of plot_tree
ConfusionMatrixDisplay.from_estimator(clf_dt,x_test,y_test,display_labels=['Does not have HD',"Has HD"])
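# clf_dts and ccp_alphas are used below but never built here; the cost-complexity
# pruning step appears to be missing. A likely version, assuming one tree is fit per
# candidate alpha returned by cost_complexity_pruning_path:
path=clf_dt.cost_complexity_pruning_path(X_train,y_train)  # candidate alphas for pruning
ccp_alphas=path.ccp_alphas
ccp_alphas=ccp_alphas[:-1]  # drop the largest alpha, which would prune the tree down to a single node
clf_dts=[]
for ccp_alpha in ccp_alphas:
    clf_dt=DecisionTreeClassifier(random_state=0,ccp_alpha=ccp_alpha)
    clf_dt.fit(X_train,y_train)
    clf_dts.append(clf_dt)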
train_scores=[clf_dt.score(X_train,y_train) for clf_dt in clf_dts]
test_scores=[clf_dt.score(x_test,y_test) for clf_dt in clf_dts]
train_scores
fig,ax=plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel('accuracy')
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas,train_scores,marker='o',label='train',drawstyle='steps-post')
ax.plot(ccp_alphas,test_scores,marker='o',label='test',drawstyle='steps-post')
ax.legend()
plt.show()
clf_dt=DecisionTreeClassifier(random_state=42,ccp_alpha=0.016)
scores=cross_val_score(clf_dt,X_train,y_train,cv=5)
df=pd.DataFrame(data={'tree':range(5),"accuracy":scores})
df.plot(x='tree',y='accuracy',marker='o',linestyle='--')
alpha_loop_values=[]
for ccp_alpha in ccp_alphas:
    clf_dt=DecisionTreeClassifier(random_state=0,ccp_alpha=ccp_alpha)
    scores=cross_val_score(clf_dt,X_train,y_train,cv=5)
    alpha_loop_values.append([ccp_alpha,np.mean(scores),np.std(scores)])
alpha_results=pd.DataFrame(alpha_loop_values,columns=['alpha','mean_accuracy','std'])
alpha_results.plot(x='alpha',y='mean_accuracy',yerr='std',marker='o',linestyle='--')
alpha_results[(alpha_results['alpha']>0.014)&(alpha_results['alpha']<0.015)]
ideal_ccp_alpha=alpha_results[(alpha_results['alpha']>0.014)&(alpha_results['alpha']<0.015)]['alpha']
ideal_ccp_alpha
ideal_ccp_alpha=float(ideal_ccp_alpha.iloc[0]) # extract the scalar; float() on a Series is deprecated in newer pandas
ideal_ccp_alpha
clf_dt_pruned=DecisionTreeClassifier(random_state=42,ccp_alpha=ideal_ccp_alpha)
clf_dt_pruned=clf_dt_pruned.fit(X_train,y_train)
ConfusionMatrixDisplay.from_estimator(clf_dt_pruned,x_test,y_test,display_labels=["Does not have HD","Has HD"])
plt.figure(figsize=(15,7.5))
plot_tree(clf_dt_pruned,filled=True,rounded=True,class_names=['No HD',"Yes HD"],
feature_names=X_encoded.columns);