#GLM
#logistic
install.packages("AER")
library(AER)
summary(Affairs)
table(Affairs$affairs)
fit.full <- glm(ynaffair ~ gender + age + yearsmarried + children +religiousness + education + occupation+rating,data=Affairs, family=binomial())
summary(fit.full)
fit.reduced <- glm(ynaffair ~ age + yearsmarried +religiousness +rating,data=Affairs, family=binomial())
summary(fit.reduced)
anova(fit.reduced, fit.full, test="Chisq")
testdata <- data.frame(rating=c(1, 2, 3, 4, 5), age=mean(Affairs$age),yearsmarried=mean(Affairs$yearsmarried),religiousness=mean(Affairs$religiousness))
testdata$prob <- predict(fit.reduced, newdata=testdata, type="response")
testdata <- data.frame(rating=mean(Affairs$rating),
age=seq(17, 57, 10),
yearsmarried=mean(Affairs$yearsmarried),
religiousness=mean(Affairs$religiousness))
testdata$prob <- predict(fit.reduced, newdata=testdata, type="response")
#对过度离势进行检验
deviance(fit.reduced)/df.residual(fit.reduced)
#置换检验 coin
library(coin)
score <- c(40, 57, 45, 55, 58, 57, 64, 55, 62, 65)
treatment <- factor(c(rep("A",5), rep("B",5)))
mydata <- data.frame(treatment, score)
t.test(score~treatment, data=mydata, var.equal=TRUE)
oneway_test(score~treatment, data=mydata, distribution="exact")
library(MASS)
UScrime <- transform(UScrime, So = factor(So))
wilcox_test(Prob ~ So, data=UScrime, distribution="exact")
library(multcomp)
set.seed(1234)
oneway_test(response~trt, data=cholesterol,
distribution=approximate(nresample=9999))
#卡方检验
library(vcd)
Arthritis <- transform(Arthritis,Improved=as.factor(as.numeric(Improved)))
chisq_test(Treatment~Improved,data = Arthritis,distribution = approximate(B=9999))
#线性模型置换检验lmPerm
require(lmPerm)
fit<-lmp(weight~height,data=women,perm="Prob")
summary(fit)
psych包:principal() 功能较强
#主成分
install.packages("psych")
library(psych)
pc<-principal(USJudgeRatings[,-1],nfactors=1)
pc<-principal(USJudgeRatings[,-1],nfactors=11, rotate = "varimax")
score <- pc$scores#获取主成分得分
#层次聚类分析
row.names(nutrient)<-tolower(row.names(nutrient))
nutrient.scaled<-scale(nutrient)
d<-dist(nutrient.scaled)
fit.average<-hclust(d,method = "average")
plot(fit.average,hang=-1,cex=.8,main="Average Linkage Clustering")
clusters <- cutree(fit.average, k=5)
aggregate(as.data.frame(nutrient.scaled),by=list(cluster=clusters),median)
plot(fit.average, hang=-1, cex=.8,
main="Average Linkage Clustering\n5 Cluster Solution")
rect.hclust(fit.average, k=5)
#分类算法
#数据准备
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste(loc, ds, sep="")
breast <- read.table(url, sep=",", header=FALSE, na.strings="?")
names(breast) <- c("ID", "clumpThickness", "sizeUniformity",
"shapeUniformity", "maginalAdhesion",
"singleEpithelialCellSize", "bareNuclei",
"blandChromatin", "normalNucleoli", "mitosis", "class")
df <- breast[-1]
df$class <- factor(df$class, levels=c(2,4),
labels=c("benign", "malignant"))
set.seed(1234)
#分为训练集和验证集
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]
table(df.train$class)
table(df.validate$class)
#比较方法 逻辑回归
fit.logit <- glm(class~., data=df.train, family=binomial())
prob <- predict(fit.logit, df.validate, type="response")
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE),
labels=c("benign", "malignant"))
logit.perf <- table(df.validate$class, logit.pred,
dnn=c("Actual", "Predicted"))
logit.perf
#计算AUC
install.packages("pROC")
library(pROC)
require(pROC)
auc = roc(df.validate$class,predict(fit.logit, df.validate))
plot(auc)
#比较方法 决策树
library(rpart)
set.seed(1234)
dtree <- rpart(class ~ ., data=df.train, method="class", parms=list(split="information"))
dtree$cptable
dtree.pruned <- prune(dtree, cp=.0125)
dtree.pred <- predict(dtree.pruned,df.validate,type = "class")
install.packages("rpart.plot")
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,
fallen.leaves = TRUE, main="Decision Tree")
#随机森林
library(randomForest)
set.seed(1234)
fit.forest <- randomForest(class~., data=df.train,
na.action=na.roughfix,
importance=TRUE)
fit.forest
Import <- importance(fit.forest, type=2)
dotchart(import[order(import),] ,main = "Importance of RF",pch=16)
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
dnn=c("Actual", "Predicted"))
forest.perf
#预测准确性度量
performance <- function(table, n=2){
if(!all(dim(table) == c(2,2)))
stop("Must be a 2 x 2 table")
tn = table[1,1]
fp = table[1,2]
fn = table[2,1]
tp = table[2,2]
sensitivity = tp/(tp+fn)
specificity = tn/(tn+fp)
ppp = tp/(tp+fp)
npp = tn/(tn+fn)
hitrate = (tp+tn)/(tp+tn+fp+fn)
result <- paste("Sensitivity = ", round(sensitivity, n) ,
"\nSpecificity = ", round(specificity, n),
"\nPositive Predictive Value = ", round(ppp, n),
"\nNegative Predictive Value = ", round(npp, n),
"\nAccuracy = ", round(hitrate, n), "\n", sep="")
cat(result)
}
#几种方法的比较
performance(logit.perf)
performance(dtree.perf)
performance(forest.perf)