ppt 89 glm fenlei

#GLM

#logistic

install.packages("AER")

library(AER)

summary(Affairs)

table(Affairs$affairs)

fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children +religiousness + education + occupation+rating,data=Affairs, family=binomial())

summary(fit.full)

fit.reduced <- glm(ynaffair ~ age + yearsmarried +religiousness +rating,data=Affairs, family=binomial())

summary(fit.reduced)

anova(fit.reduced, fit.full, test="Chisq")

testdata <- data.frame(rating=c(1, 2, 3, 4, 5), age=mean(Affairs$age),yearsmarried=mean(Affairs$yearsmarried),religiousness=mean(Affairs$religiousness))

testdata$prob <- predict(fit.reduced, newdata=testdata, type="response")

testdata <- data.frame(rating=mean(Affairs$rating),

age=seq(17, 57, 10),

yearsmarried=mean(Affairs$yearsmarried),

religiousness=mean(Affairs$religiousness))

testdata$prob <- predict(fit.reduced, newdata=testdata, type="response")

#对过度离势进行检验

deviance(fit.reduced)/df.residual(fit.reduced)

#置换检验 coin

library(coin)

score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)

treatment <- factor(c(rep("A",5), rep("B",5)))

mydata <- data.frame(treatment, score)

t.test(score~treatment, data=mydata, var.equal=TRUE)

oneway_test(score~treatment, data=mydata, distribution="exact")

library(MASS)

UScrime <- transform(UScrime, So = factor(So))

wilcox_test(Prob ~ So, data=UScrime, distribution="exact")

library(multcomp)

set.seed(1234)

oneway_test(response~trt, data=cholesterol,

distribution=approximate(nresample=9999))

#卡方检验

library(vcd)

Arthritis <- transform(Arthritis,Improved=as.factor(as.numeric(Improved)))

chisq_test(Treatment~Improved,data = Arthritis,distribution = approximate(B=9999))

#线性模型置换检验lmPerm

require(lmPerm)

fit<-lmp(weight~height,data=women,perm="Prob")

summary(fit)

psych包：principal() 功能较强

#主成分

install.packages("psych")

library(psych)

pc<-principal(USJudgeRatings[,-1],nfactors=1)

pc<-principal(USJudgeRatings[,-1],nfactors=11, rotate = "varimax")

score <- pc$scores#获取主成分得分

#层次聚类分析

row.names(nutrient)<-tolower(row.names(nutrient))

nutrient.scaled<-scale(nutrient)

d<-dist(nutrient.scaled)

fit.average<-hclust(d,method = "average")

plot(fit.average,hang=-1,cex=.8,main="Average Linkage Clustering")

clusters <- cutree(fit.average, k=5)

aggregate(as.data.frame(nutrient.scaled),by=list(cluster=clusters),median)

plot(fit.average, hang=-1, cex=.8,

main="Average Linkage Clustering\n5 Cluster Solution")

rect.hclust(fit.average, k=5)

#分类算法

#数据准备

loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"

ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"

url <- paste(loc, ds, sep="")

breast <- read.table(url, sep=",", header=FALSE, na.strings="?")

names(breast) <- c("ID", "clumpThickness", "sizeUniformity",

"shapeUniformity", "maginalAdhesion",

"singleEpithelialCellSize", "bareNuclei",

"blandChromatin", "normalNucleoli", "mitosis", "class")

df <- breast[-1]

df$class <- factor(df$class, levels=c(2,4),

labels=c("benign", "malignant"))

set.seed(1234)

#分为训练集和验证集

train <- sample(nrow(df), 0.7*nrow(df))

df.train <- df[train,]

df.validate <- df[-train,]

table(df.train$class)

table(df.validate$class)

#比较方法逻辑回归

fit.logit <- glm(class~., data=df.train, family=binomial())

prob <- predict(fit.logit, df.validate, type="response")

logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE),

labels=c("benign", "malignant"))

logit.perf <- table(df.validate$class, logit.pred,

dnn=c("Actual", "Predicted"))

logit.perf

#计算AUC

install.packages("pROC")

library(pROC)

require(pROC)

auc = roc(df.validate$class,predict(fit.logit, df.validate))

plot(auc)

#比较方法决策树

library(rpart)

set.seed(1234)

dtree <- rpart(class ~ ., data=df.train, method="class", parms=list(split="information"))

dtree$cptable

dtree.pruned <- prune(dtree, cp=.0125)

dtree.pred <- predict(dtree.pruned,df.validate,type = "class")

install.packages("rpart.plot")

library(rpart.plot)

prp(dtree.pruned, type = 2, extra = 104,

fallen.leaves = TRUE, main="Decision Tree")

#随机森林

library(randomForest)

set.seed(1234)

fit.forest <- randomForest(class~., data=df.train,

na.action=na.roughfix,

importance=TRUE)

fit.forest

Import <- importance(fit.forest, type=2)

dotchart(import[order(import),] ,main = "Importance of RF",pch=16)

forest.pred <- predict(fit.forest, df.validate)

forest.perf <- table(df.validate$class, forest.pred,

dnn=c("Actual", "Predicted"))

forest.perf

#预测准确性度量

performance <- function(table, n=2){

if(!all(dim(table) == c(2,2)))

stop("Must be a 2 x 2 table")

tn = table[1,1]

fp = table[1,2]

fn = table[2,1]

tp = table[2,2]

sensitivity = tp/(tp+fn)

specificity = tn/(tn+fp)

ppp = tp/(tp+fp)

npp = tn/(tn+fn)

hitrate = (tp+tn)/(tp+tn+fp+fn)

result <- paste("Sensitivity = ", round(sensitivity, n) ,

"\nSpecificity = ", round(specificity, n),

"\nPositive Predictive Value = ", round(ppp, n),

"\nNegative Predictive Value = ", round(npp, n),

"\nAccuracy = ", round(hitrate, n), "\n", sep="")

cat(result)

}

#几种方法的比较

performance(logit.perf)

performance(dtree.perf)

performance(forest.perf)

ppt 89 glm fenlei

#GLM

#主成分

#层次聚类分析

#分类算法

#比较方法 逻辑回归

#比较方法 决策树

#随机森林

#预测准确性度量

#几种方法的比较

推荐阅读更多精彩内容

#比较方法逻辑回归

#比较方法决策树