Classification Algorithms and Data Mining

################ Classification algorithms and data mining --- classification as opposed to regression: methods for a binary 0/1 outcome Y

#### Classification analysis --- e.g., judge a person's credit to predict whether they will repay, and decide whether to lend to them (0/1);

#### Or predict from other features whether a tumor is present (0/1). Similar to logistic regression with y = 0/1, except the data are split into a training set and a validation set

##### Packages: rpart, rpart.plot, party (randomForest and e1071 are also used below)

##### Methods for a binary y = 0/1 outcome: logistic regression, decision trees, random forests, support vector machines, neural networks, naive Bayes, and k-nearest neighbors.

# Data preparation --- UCI Machine Learning Repository

loc<-"https://archive.ics.uci.edu/ml/machine-learning-databases/"

ds<-"breast-cancer-wisconsin/breast-cancer-wisconsin.data"

url<-paste(loc,ds,sep = "")

loc

ds

url

breast<-read.table(url,sep = ",",header = F,na.strings = "?")

names(breast)<-c("ID","clumpThickness","sizeUniformity","shapeUniformity","maginalAdhesion","singleEpithelialCellSize",

                "bareNuclei","blandChromatin","normalNucleoli","mitosis","class")

df<-breast[-1] ## drop the ID column

df$class<-factor(df$class,levels = c(2,4),labels = c("benign","malignant"))
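## Optional sanity check (not in the original): look at the class balance and the missing values that na.strings = "?" read in

table(df$class)

sum(is.na(df))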

## set.seed() makes the run reproducible: anyone who also uses 1234 gets the same random numbers

set.seed(1234)

## Draw the training indices: sample 0.7*nrow(df) row numbers from the nrow(df) = 699 rows without replacement (use replace=TRUE for sampling with replacement)

train<-sample(nrow(df),0.7*nrow(df))

## Extract the training set

df.train<-df[train,]

### Extract the validation set

df.validate<-df[-train,]

## Check the class counts in each set

table(df.train$class)

table(df.validate$class)
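## Optional: class proportions rather than raw counts

prop.table(table(df.train$class))

prop.table(table(df.validate$class))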

### Method 1: logistic regression

### In glm(a~.), the dot stands for every predictor except a

## Fit the model:

fit_logit<-glm(class~.,data = df.train,family = binomial())

summary(fit_logit)
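## Optional (not in the original): exponentiate the coefficients to read them as odds ratios

exp(coef(fit_logit))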

## Validate the fitted model on the new data set: type = "response" returns predicted probabilities between 0 and 1

prob<-predict(fit_logit,df.validate,type="response")

prob

## Label probabilities above 0.5 as malignant and the rest as benign

logit.pred<-factor(prob>.5,levels = c(FALSE,TRUE),labels = c("benign","malignant"))

logit.pred

## Cross-tabulate actual against predicted classes

logit.perf<-table(df.validate$class,logit.pred,dnn=c("Actual","Predicted"))

### 118 benign and 76 malignant cases are classified correctly

#### Accuracy is (76+118)/200 = 0.97, not (76+118)/(129+81) = 0.92: 129+81 = 210 counts every validation row, but the 10 rows with missing predictor values get NA predictions and drop out of the table, which therefore holds only 200 cases

(76+118)/(129+81) ### problematic: the denominator includes the 10 dropped rows

(76+118)/200
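## A quick check of the denominator (sketch, not in the original): count the validation rows whose NA predictions drop out of the table

sum(!complete.cases(df.validate)) ## rows with at least one missing value

nrow(df.validate)-sum(!complete.cases(df.validate)) ## cases actually classified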

### Looking back at the model summary, several predictors have p-values > 0.05 and are not significant; you can drop them and refit by hand, or use stepwise selection:

logit.fit.reduced<-step(fit_logit)
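## A minimal sketch (not in the original): the reduced model predicts the same way as the full one, so its performance can be compared directly

prob.reduced<-predict(logit.fit.reduced,df.validate,type="response")

logit.reduced.pred<-factor(prob.reduced>.5,levels = c(FALSE,TRUE),labels = c("benign","malignant"))

table(df.validate$class,logit.reduced.pred,dnn=c("Actual","Predicted"))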

### Decision trees come in two kinds: classical trees and conditional inference trees

#### Classical trees --- rpart() in the rpart package grows the tree; prune() cuts it back to the subtree with the smallest prediction error

library(rpart)

set.seed(1234)

## Grow the tree

dtree<-rpart(class~.,data = df.train,method="class",parms = list(split="information"))

dtree$cptable

plotcp(dtree)

### Prune, using the complexity parameter cp = 0.0125 read from dtree$cptable

dtree.pruned<-prune(dtree,cp=0.0125)
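## Alternative sketch (not in the original): take the cp with the lowest cross-validated error straight from the cptable instead of reading it off by eye (dtree.pruned.auto is a hypothetical name; it matches dtree.pruned when best.cp is 0.0125)

best.cp<-dtree$cptable[which.min(dtree$cptable[,"xerror"]),"CP"]

dtree.pruned.auto<-prune(dtree,cp=best.cp)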

## Plot: prp() draws the final tree

library(rpart.plot)

prp(dtree.pruned,type=2,extra = 104,fallen.leaves =TRUE,main="Decision Tree" )

## Classify the validation set

dtree.pred<-predict(dtree.pruned,df.validate,type="class")

dtree.perf<-table(df.validate$class,dtree.pred,dnn = c("Actual","Predicted"))

dtree.perf

(122+79)/210 ## accuracy; rpart classifies all 210 validation rows because surrogate splits handle missing values

## Conditional inference trees (party package): splits are chosen by significance tests, in contrast to the purity-based splits of classical trees

library(party)

fit_ctree<-ctree(class~.,data = df.train)

plot(fit_ctree,main="Conditional Inference Tree")

ctree.pred<-predict(fit_ctree,df.validate,type="response")

ctree.perf<-table(df.validate$class,ctree.pred,dnn = c("Actual","Predicted"))

ctree.perf

### Random forests: every tree votes, and the modal class across the trees becomes the forest's prediction for a case

### randomForest() in the randomForest package; it grows 500 trees by default

library(randomForest)

set.seed(1234)

### na.action=na.roughfix replaces missing values with the column median (numeric) or mode (factor)

fit.forest<-randomForest(class~.,df.train,na.action=na.roughfix,importance=TRUE)

fit.forest

### Variable importance: see which predictors matter most (type = 2 reports the total decrease in node impurity)

importance(fit.forest,type = 2)
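## Optional: varImpPlot(), also from randomForest, plots the importance measures

varImpPlot(fit.forest)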

## Validate on the validation set

forest.pred<-predict(fit.forest,df.validate)

forest.perf<-table(df.validate$class,forest.pred,dnn = c("Actual","Predicted"))

forest.perf

## Support vector machine (SVM): projects the data into a higher dimension, e.g. (x, y) -> (x^2, sqrt(2)*x*y, y^2), turning 2-D points into 3-D features (Z1, Z2, Z3)
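## A small sketch (not in the original) verifying that projection: the degree-2 polynomial kernel K(u,v) = (u.v)^2 equals the plain dot product of the explicit 3-D features

phi<-function(p) c(p[1]^2,sqrt(2)*p[1]*p[2],p[2]^2)

u<-c(1,2); v<-c(3,4)

sum(phi(u)*phi(v)) ## 121: dot product in the 3-D feature space

sum(u*v)^2 ## 121: the same value computed in the original 2-D space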

### ksvm() in the kernlab package is powerful; svm() in the e1071 package is simpler

library(e1071)

set.seed(1234)

fit.svm<-svm(class~.,df.train)

fit.svm

svm.pred<-predict(fit.svm,na.omit(df.validate))

svm.perf<-table(na.omit(df.validate)$class,svm.pred,dnn = c("Actual","Predicted"))

svm.perf

##### Tune the hyperparameters to improve the SVM: tune.svm() searches a grid of gamma and cost values

set.seed(1234)

tuned<-tune.svm(class~.,data=df.train,gamma = 10^(-6:1),cost = 10^(-10:10))

## The best combination is gamma = 0.01, cost = 1

tuned
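## Note (not in the original): tune.svm() already refits the winning model, so tuned$best.model could be used directly instead of refitting by hand as below

tuned$best.parameters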

fit.tuned.svm<-svm(class~.,data = df.train,gamma=0.01,cost=1)

svm.tuned.pred<-predict(fit.tuned.svm,na.omit(df.validate))

svm.tuned.perf<-table(na.omit(df.validate)$class,svm.tuned.pred,dnn = c("Actual","Predicted"))

svm.tuned.perf

##############################################

### Pick the best-predicting model --- assess binary classification accuracy

performance<-function(table,n=2){
  if(!all(dim(table)==c(2,2)))
    stop("Must be a 2x2 table")
  tn=table[1,1]                     # true negatives
  fp=table[1,2]                     # false positives
  fn=table[2,1]                     # false negatives
  tp=table[2,2]                     # true positives
  sensitivity=tp/(tp+fn)            # proportion of malignant cases found
  specificity=tn/(tn+fp)            # proportion of benign cases found
  ppp=tp/(tp+fp)                    # positive predictive value
  npp=tn/(tn+fn)                    # negative predictive value
  hitrate=(tp+tn)/(tp+tn+fp+fn)     # overall accuracy
  result<-paste("Sensitivity=",round(sensitivity,n),
                "\nSpecificity=",round(specificity,n),
                "\nPositive Predictive Value=",round(ppp,n),
                "\nNegative Predictive Value=",round(npp,n),
                "\nAccuracy=",round(hitrate,n),"\n",sep="")
  cat(result)
}

performance(logit.perf) 

performance(dtree.perf)

performance(ctree.perf)

performance(forest.perf)

performance(svm.perf)

performance(svm.tuned.perf)
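## A final sketch (not in the original): collect the overall accuracy of all six classifiers in one vector; sum(diag(t))/sum(t) is the hit rate of a 2x2 table

perfs<-list(logit=logit.perf,dtree=dtree.perf,ctree=ctree.perf,forest=forest.perf,svm=svm.perf,svm.tuned=svm.tuned.perf)

sapply(perfs,function(t) round(sum(diag(t))/sum(t),3))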

################################################

### Data mining with the rattle package: library(rattle), then rattle() opens the GUI

# Data preparation (repeated so the rattle session is self-contained)

loc<-"https://archive.ics.uci.edu/ml/machine-learning-databases/"

ds<-"breast-cancer-wisconsin/breast-cancer-wisconsin.data"

url<-paste(loc,ds,sep = "")

breast<-read.table(url,sep = ",",header = F,na.strings = "?")

names(breast)<-c("ID","clumpThickness","sizeUniformity","shapeUniformity","maginalAdhesion","singleEpithelialCellSize",

                "bareNuclei","blandChromatin","normalNucleoli","mitosis","class")

df<-breast[-1]

df$class<-factor(df$class,levels = c(2,4),labels = c("benign","malignant"))

breast$class<-factor(breast$class,levels = c(2,4),labels = c("benign","malignant")) ## convert breast$class itself (df$class is already a labeled factor)

library(rattle)

rattle()
