#2021-22 exam
getwd()
library(caret)
library(class)          # for knn(), used in Question 4
library(gbm)
library(glmnet)
library(ISLR)
library(MASS)
library(nnet)
library(pROC)
library(randomForest)
# sanity checks (run these after the Q4 data below has been loaded and scaled):
dim(X.s.valid)
length(Y.valid)
# Question 4 ####
# Note: add header=F to the lecturer's code, otherwise it won't run
X = read.csv(file="Q4_X.csv")
Y = read.csv(file="Q4_Y.csv",stringsAsFactors=TRUE,header = F)[,1]
X.valid = read.csv(file="Q4_Xvalid.csv")
Y.valid = read.csv(file="Q4_Yvalid.csv",stringsAsFactors=TRUE,header = F)[,1]
X.s = scale(X)
# scale the validation set with the training means/sds, not its own:
X.s.valid = scale(X.valid, center = attr(X.s, "scaled:center"),
                  scale = attr(X.s, "scaled:scale"))
X.s = as.data.frame(X.s)   # assign the conversion, otherwise it is lost
# a) try: K-fold CV of kNN over k = 1..Kmax
# N = nrow(X)
# K = 5       # number of CV folds
# Kmax = 30   # largest number of neighbours tried
# set.seed(6041)
# folds = cut(1:N, K, labels=FALSE)
# acc = matrix(NA, nrow=K, ncol=Kmax)
# for(k in 1:Kmax){
#   for(i in 1:K){
#     itrain = which(folds != i)
#     ko = knn(X.s[itrain,], X.s[-itrain,], Y[itrain], k)
#     tb = table(ko, Y[-itrain])
#     acc[i,k] = sum(diag(tb)) / sum(tb)
#   }
# }
K=5
ko = knn(X.s, X.s.valid, Y, K)
# (i) note: this answer is missing a cross-validation step! (see the sketch after (ii))
Kmax = 30
acc = numeric(Kmax)
for(k in 1:Kmax){
  ko = knn(X.s, X.s.valid, Y, k)
  tb = table(ko, Y.valid)
  acc[k] = sum(diag(tb)) / sum(tb)   # acc is a vector, so index it with k
}
plot(1-acc, pch=20, t="b", xlab='k', ylab='validation error')
# we get the highest accuracy for k = 19, 20 or 21
# (ii)
mean(acc) # 0.6513333
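# A minimal sketch of the missing cross-validation flagged in (i), under the same
# fold scheme as the commented draft above (5 folds and seed 6041 are arbitrary choices):
N = nrow(X.s)
K = 5
set.seed(6041)
folds = cut(sample(1:N), K, labels=FALSE)  # random fold memberships
cv.acc = matrix(NA, nrow=K, ncol=Kmax)
for(k in 1:Kmax){
  for(i in 1:K){
    itrain = which(folds != i)
    ko = knn(X.s[itrain,], X.s[-itrain,], Y[itrain], k)
    tb = table(ko, Y[-itrain])
    cv.acc[i,k] = sum(diag(tb)) / sum(tb)
  }
}
which.max(apply(cv.acc, 2, mean))  # k with the best mean CV accuracy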
# a) (draft kept for reference; the regsubsets block further below comes from
#    lecture code for a different dataset: dat, features, sal)
# set.seed(4061)
# subsets <- c(1:P)
# ctrl <- rfeControl(functions = rfFuncs,
# method = "cv",
# number = 5,
# verbose = FALSE)
# rf.rfe <- rfe(X.s, Y,
# sizes = subsets,
# rfeControl = ctrl)
#
# rf.rfe
#
# # how about cross-validating this?
# set.seed(4061) # for reproducibility
# K = 10
# folds = cut(1:n, breaks=K, labels=FALSE)
# sel = matrix(0, nrow=K, ncol=ncol(features))
# colnames(sel) = names(features)
# for(k in 1:K){
# itr = which(folds!=k)
# reg.bwd = regsubsets(sal~., data=dat[itr,],
# nvmax=ncol(features))
# opt.mod = which.max(summary(reg.bwd)$adjr2)
# isel = which(summary(reg.bwd)$which[opt.mod,-1])
# sel[k,isel] = 1
# }
# apply(sel,2,mean)*100
# # X1 and X14, again...
# b) recursive feature elimination (RFE) with random forests
set.seed(4061)
P = ncol(X.s)      # P was never defined in the active code above
subsets <- 1:P
ctrl <- rfeControl(functions = rfFuncs,
                   method = "cv",
                   number = 5,
                   # method = "repeatedcv",
                   # repeats = 5,
                   verbose = FALSE)
rf.rfe <- rfe(X.s, Y,
sizes = subsets,
rfeControl = ctrl)
rf.rfe
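# The variables retained by RFE can be listed directly with caret's predictors() helper:
predictors(rf.rfe)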
# c)
rfe.pred = predict(rf.rfe, X.s.valid)  # predict() takes the model first, then the new data
rf.rfe$fit   # the final model, refitted on the selected variables
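# A sketch of assessing c) against the validation labels (assumes, as with
# rfFuncs, that predict() on an rfe object returns a 'pred' column for classification):
confusionMatrix(rfe.pred$pred, Y.valid)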
# d) single-hidden-layer neural networks of increasing size
Kmax = 30
acc = numeric(Kmax)
for(k in 1:Kmax){
  # formula interface so nnet treats the factor Y as a classification target
  nno1 = nnet(Y ~ ., data = data.frame(Y=Y, X.s), size = k,
              MaxNWts = 10000, trace = FALSE)
  # compare predictions (not the model object) with the validation labels
  nnp = factor(predict(nno1, data.frame(X.s.valid), type="class"), levels=levels(Y))
  tb = table(nnp, Y.valid)
  acc[k] = sum(diag(tb)) / sum(tb)
}
plot(1-acc, pch=20, t="b", xlab='size', ylab='validation error')
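# A small follow-up sketch: refit at the hidden-layer size that did best above
# (single validation split, not CV; MaxNWts raised in case the default cap is hit):
best.size = which.max(acc)
nno.best = nnet(Y ~ ., data = data.frame(Y=Y, X.s), size = best.size,
                MaxNWts = 10000, trace = FALSE)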
# Question 2 ####
dat.nona = na.omit(airquality)
dat = airquality
dat.nona$Month = as.factor(dat.nona$Month)
dat.nona$Day = as.factor(dat.nona$Day)
dat$Month = as.factor(dat$Month)
dat$Day = as.factor(dat$Day)
# a)
# correlation matrix listing the relationship between every pair of numeric variables
M = cor(dat.nona[, sapply(dat.nona, is.numeric)])
diag(M) = 0   # the diagonal is all 1s; zero it so the threshold test below ignores it
which(abs(M) > .9, arr.ind=TRUE)
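# caret offers the same check as a helper; a sketch on the raw correlation matrix:
findCorrelation(cor(dat.nona[, sapply(dat.nona, is.numeric)]), cutoff = 0.9)
# returns the indices of columns one could consider dropping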
# Question 3 ####
library(ISLR) # for the data
library(randomForest)
library(gbm)
x.train = Khan$xtrain
x.test = Khan$xtest
y.train = as.factor(Khan$ytrain)
y.test = as.factor(Khan$ytest)
dim(x.train)
View(x.train)
# a)
par(mfrow=c(1,2))
boxplot(x.train~y.train)
boxplot(x.test~y.test)
# In both the training and test samples, x is not very distinguishable across the
# 4 classes: the distributions look similar, with large variance and means close to
# (but not exactly) 0, so the data does not appear to have been normalised.
# However, the number of variables (columns) far exceeds the number of
# observations (rows), so overfitting is a concern.
# # Bartlett's test with H0: all variances are equal
# pval.train = pval.test = numeric(ncol(x.train))
# for(j in 1:ncol(x.train)){
#   pval.train[j] = bartlett.test(x.train[,j] ~ y.train)$p.value
#   pval.test[j] = bartlett.test(x.test[,j] ~ y.test)$p.value
# }
# # interpreting the p-values: if p < 0.05, reject H0 and
# # conclude the variances are not all equal
#
# # Shapiro's test with H0: the distribution is Normal
# # (shapiro.test takes a numeric vector, not a formula)
# pval.train = pval.test = numeric(ncol(x.train))
# for(j in 1:ncol(x.train)){
#   pval.train[j] = shapiro.test(x.train[,j])$p.value
#   pval.test[j] = shapiro.test(x.test[,j])$p.value
# }
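# # with thousands of tests, adjust for multiplicity before counting rejections:
# sum(p.adjust(pval.train, method="BH") < 0.05)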
# b) this is a classification problem (the response is a factor with 4 classes)
# c)
set.seed(4061)
rf.out = randomForest(x.train, y.train, ntree=1000)  # the argument is ntree, not ntrees
rf.yhat = predict(rf.out, x.train, type="response")  # fitted values on the training set
confusionMatrix(rf.yhat, y.train)
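# Training-set accuracy is optimistic; the out-of-bag (OOB) estimate is more honest:
rf.out$confusion  # OOB confusion matrix stored by randomForest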
# d)
rf.pred = predict(rf.out, x.test, type="response")
confusionMatrix(rf.pred, y.test)
# Accuracy : 0.95
# e) variables whose mean decrease in Gini exceeds 0.4
which(rf.out$importance>0.4)
#246 509 545 1003 1194 1389 1645 1954 1955 2050
# f)
rf.out$importance[which(rf.out$importance>0.4)]
varImpPlot(rf.out)
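# A quick sketch listing the ten most important variables in order:
head(sort(rf.out$importance[,1], decreasing=TRUE), 10)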
# g) gradient boosting; with 4 classes the distribution is "multinomial", not "bernoulli"
dat.train = data.frame(y=y.train, x.train)  # unnamed matrix columns become X1, X2, ...
dat.test = data.frame(y=y.test, x.test)
gb.out = gbm(y~., data=dat.train, distribution="multinomial",
             n.trees=100, interaction.depth=1)
gb.p = predict(gb.out, dat.test, n.trees=100, type="response")  # class probabilities
gb.pred = factor(levels(y.test)[apply(gb.p[,,1], 1, which.max)], levels=levels(y.test))
confusionMatrix(reference=y.test, data=gb.pred, mode="everything")
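# An alternative via caret's train(), as the draft above attempted; a sketch with
# 5-fold CV over gbm's default tuning grid (slow with this many predictors):
ctrl.gb = trainControl(method="cv", number=5)
gb.tr = train(y ~ ., data=dat.train, method='gbm',
              distribution='multinomial', trControl=ctrl.gb, verbose=FALSE)
gb.tr.pred = predict(gb.tr, dat.test)
confusionMatrix(reference=y.test, data=gb.tr.pred, mode="everything")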
# Question 1 ####
# a) (1) a (decision) tree
#    (2) a classification problem: the predictions should be
#        'young', 'intermediate' or 'mature'
#    (3) Obs1 = young
#        Obs2 = intermediate
#        Obs3 = mature
#        Obs4 = mature
#        Obs5 = young
# c) randomForest
# d) bagging
(e)通过集成学习法,让机器多次学习训练样本。生成不同的树进行拟合,再取平均,可以提升预测的精准性。降低了偏差