BBJJ-11111

# 2021-22 exam

getwd()

library(caret)
library(class)        # for knn()
library(gbm)
library(glmnet)
library(ISLR)
library(MASS)
library(nnet)
library(pROC)
library(randomForest)

dim(X.s.valid)   # sanity checks (these refer to objects created under Question 4 below)
length(Y.valid)

# Question 4 ####

# NB: add header=FALSE to the lecturer's read.csv calls, otherwise the code won't run.

X = read.csv(file="Q4_X.csv")

Y = read.csv(file="Q4_Y.csv",stringsAsFactors=TRUE,header = F)[,1]

X.valid = read.csv(file="Q4_Xvalid.csv")

Y.valid = read.csv(file="Q4_Yvalid.csv",stringsAsFactors=TRUE,header = F)[,1]

X.s = scale(X)
# (a bare as.data.frame(X.s) has no effect; assign the result if a data frame is needed)

X.s.valid = scale(X.valid)
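# Note: scale(X.valid) standardises the validation set with its own means/SDs.
# Arguably the training statistics should be reused instead, e.g.:
# X.s.valid = scale(X.valid, center = attr(X.s, "scaled:center"),
#                   scale = attr(X.s, "scaled:scale"))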

# a) first attempt at K-fold cross-validation (corrected so that it runs):
# N = nrow(X)
# K = 5
# Kmax = 30
# set.seed(6041)
# folds = cut(sample(1:N), K, labels=FALSE)  # K folds, randomly assigned
# acc = matrix(NA, nrow=K, ncol=Kmax)
# for(k in 1:Kmax){
#   for(i in 1:K){
#     itrain = which(folds != i)
#     ko = knn(X.s[itrain,], X.s[-itrain,], Y[itrain], k)
#     tb = table(ko, Y[-itrain])
#     acc[i,k] = sum(diag(tb)) / sum(tb)
#   }
# }
# apply(acc, 2, mean)  # CV accuracy for each candidate k

K=5

ko = knn(X.s, X.s.valid, Y, K)

# (i) note: the loop below tunes k on the validation set; strictly, this still
# lacks cross-validation!! (see the corrected CV loop above)

Kmax = 30

acc = numeric(Kmax)

for(k in 1:Kmax){

  ko = knn(X.s, X.s.valid, Y, k)

  tb = table(ko, Y.valid)

  acc[k] = sum(diag(tb)) / sum(tb)  # acc is a vector, so index by k (not acc[i,])

}

plot(1-acc, pch=20, t="b", xlab='k')

# we get the highest accuracy when k = 19, 20, 21
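# Reading the best k off programmatically rather than from the plot:
which(acc == max(acc))  # should return the tied values 19, 20, 21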

# (ii)
mean(acc) # 0.6513333


# Reference (pasted from elsewhere; n, features, dat and sal are not defined in
# this script, and regsubsets() needs library(leaps)):
# # how about cross-validating the feature-selection step itself?
# set.seed(4061) # for reproducibility
# K = 10
# folds = cut(1:n, breaks=K, labels=FALSE)
# sel = matrix(0, nrow=K, ncol=ncol(features))
# colnames(sel) = names(features)
# for(k in 1:K){
#   itr = which(folds!=k)
#   reg.bwd = regsubsets(sal~., data=dat[itr,], nvmax=ncol(features))
#   opt.mod = which.max(summary(reg.bwd)$adjr2)
#   isel = which(summary(reg.bwd)$which[opt.mod,-1])
#   sel[k,isel] = 1
# }
# apply(sel,2,mean)*100
# # X1 and X14, again...

# b)
set.seed(4061)
P = ncol(X.s)  # number of predictors (defined here so that subsets is well-defined)
subsets <- 1:P
ctrl <- rfeControl(functions = rfFuncs,
                   method = "cv",
                   number = 5,
                   # method = "repeatedcv",
                   # repeats = 5,
                   verbose = FALSE)
rf.rfe <- rfe(X.s, Y,
              sizes = subsets,
              rfeControl = ctrl)
rf.rfe
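# caret provides accessors for inspecting the RFE result (standard caret
# functions, shown here as a sketch):
predictors(rf.rfe)             # variables retained at the optimal subset size
rf.rfe$optsize                 # the chosen subset size
plot(rf.rfe, type=c("g","o"))  # CV performance profile across subset sizes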

# c)
pred.rfe = predict(rf.rfe, X.s.valid)  # model first, then new data (not the other way round)
pred.rfe
rf.rfe$fit  # the underlying random forest fit
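# Validation-set performance (assumption: with a factor response, caret's
# rfFuncs prediction returns a data frame whose 'pred' column holds the classes):
confusionMatrix(pred.rfe$pred, Y.valid)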

# d)
Kmax = 30
acc = numeric(Kmax)
set.seed(4061)
for(k in 1:Kmax){
  # nnet's matrix interface needs a class-indicator response and softmax output
  nno = nnet(X.s, class.ind(Y), size=k, softmax=TRUE, trace=FALSE)
  np = predict(nno, X.s.valid)  # matrix of class probabilities
  yhat = factor(levels(Y)[max.col(np)], levels=levels(Y))
  tb = table(yhat, Y.valid)
  acc[k] = sum(diag(tb)) / sum(tb)
}
plot(1-acc, pch=20, t="b", xlab='size (number of hidden units)')

# Question 2 ####

dat.nona = na.omit(airquality)

dat = airquality

dat.nona$Month = as.factor(dat.nona$Month)

dat.nona$Day = as.factor(dat.nona$Day)

dat$Month = as.factor(dat$Month)

dat$Day = as.factor(dat$Day)

# a)
# correlation matrix: the pairwise correlations between the numeric variables
M = cor(dat.nona[, sapply(dat.nona, is.numeric)])
diag(M) = 0  # the diagonal entries are all 1; zero them so the threshold test ignores them
which(abs(M) > .9)
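# To map those flat indices back to variable pairs, arr.ind=TRUE returns
# row/column positions (each pair appears twice because M is symmetric):
idx = which(abs(M) > .9, arr.ind=TRUE)
cbind(rownames(M)[idx[,1]], colnames(M)[idx[,2]])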

# Question 3 ####

library(ISLR) # for the data

library(randomForest)

library(gbm)

x.train = Khan$xtrain

x.test = Khan$xtest

y.train = as.factor(Khan$ytrain)

y.test = as.factor(Khan$ytest)

dim(x.train)

View(x.train)

# a)
par(mfrow=c(1,2))
boxplot(x.train ~ y.train)  # pools all gene-expression values within each class
boxplot(x.test ~ y.test)
# In both the training and the test samples, x hardly separates the 4 classes:
# the distributions look very similar, with fairly large variance and means close
# to (but not exactly) 0, so the data do not appear to have been standardised.
# However, the number of variables (columns) is far larger than the number of
# observations (rows), so overfitting is a real concern.
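# A quick numeric check of the standardisation claim (column means and SDs):
summary(apply(x.train, 2, mean))
summary(apply(x.train, 2, sd))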

# # Bartlett's test with H0: all variances are equal
# pval.train = pval.test = numeric(ncol(x.train))
# for(j in 1:ncol(x.train)){
#   pval.train[j] = bartlett.test(x.train[,j] ~ y.train)$p.value
#   pval.test[j] = bartlett.test(x.test[,j] ~ y.test)$p.value
# }
# # interpreting the p-values: if p < 0.05, reject H0 and conclude the
# # class variances differ
#
# # Shapiro's test with H0: the distribution is Normal
# # (shapiro.test takes a numeric vector, not a formula)
# pval.train = pval.test = numeric(ncol(x.train))
# for(j in 1:ncol(x.train)){
#   pval.train[j] = shapiro.test(x.train[,j])$p.value
#   pval.test[j] = shapiro.test(x.test[,j])$p.value
# }

# b)
# This is a classification problem: the response is a factor with 4 classes.
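# the class counts confirm a small-n, multi-class problem:
table(y.train)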

# c)
set.seed(4061)
rf.out = randomForest(x.train, y.train, ntree=1000)  # the argument is ntree, not ntrees
rf.yhat = predict(rf.out, x.train)  # predicted classes ("response" is the default type)
confusionMatrix(rf.yhat, y.train)
# (re-predicting the training set is optimistic; the OOB error printed by rf.out
# is a fairer internal estimate)

# d)
rf.pred = predict(rf.out, x.test)
confusionMatrix(rf.pred, y.test)
# Accuracy : 0.95

# e)

which(rf.out$importance>0.4)

#246  509  545 1003 1194 1389 1645 1954 1955 2050

# f)

rf.out$importance[which(rf.out$importance>0.4)]

varImpPlot(rf.out)
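# top 10 genes by importance (for a classification forest this is MeanDecreaseGini):
head(sort(rf.out$importance[,1], decreasing=TRUE), 10)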

# g)
# gbm needs a data frame, and with 4 classes a multinomial (not bernoulli) loss;
# recent gbm versions warn that multinomial support is only partly maintained
dat.train = data.frame(y=y.train, x.train)
dat.test = data.frame(x.test)
gb.out = gbm(y~., data=dat.train, distribution="multinomial",
             n.trees=100, interaction.depth=1)
gb.p = predict(gb.out, dat.train, n.trees=100, type="response")  # n x 4 x 1 array
gb.fitted = factor(colnames(gb.p)[apply(gb.p[,,1], 1, which.max)], levels=levels(y.train))
confusionMatrix(gb.fitted, y.train)
?gbm

# caret alternative (tunes n.trees and interaction.depth by resampling; slow
# with p = 2308 predictors):
gb.caret = train(x=data.frame(x.train), y=y.train, method='gbm', verbose=FALSE)
gb.tr.fitted = predict(gb.caret)         # corresponding fitted values
gb.pred = predict(gb.caret, dat.test)    # test-set predictions
confusionMatrix(reference=y.test, data=gb.pred, mode="everything")
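# gbm can also suggest how many trees to use; with the default bag.fraction the
# OOB criterion is available (gbm itself warns this heuristic is conservative):
best.iter = gbm.perf(gb.out, method="OOB")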





# Question 1 ####
# (a) (1) A decision tree.
#     (2) A classification problem; the predicted labels are 'young',
#         'intermediate' or 'mature'.
#     (3) Obs1 = young
#         Obs2 = intermediate
#         Obs3 = mature
#         Obs4 = mature
#         Obs5 = young
# (c) randomForest
# (d) Bagging
# (e) An ensemble method: the model is trained repeatedly on bootstrap resamples
#     of the training data, a different tree is grown on each resample, and their
#     predictions are averaged (majority-voted for classification). This improves
#     predictive accuracy by reducing the variance of the predictions.
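# A minimal hand-rolled illustration of the bagging idea (illustration only:
# rpart trees on the built-in iris data, not the exam data):
library(rpart)
set.seed(4061)
B = 100
n = nrow(iris)
votes = matrix(0, n, nlevels(iris$Species))
for(b in 1:B){
  ib = sample(1:n, n, replace=TRUE)        # bootstrap resample
  tr = rpart(Species~., data=iris[ib,])    # grow a tree on the resample
  pb = predict(tr, iris, type="class")
  votes[cbind(1:n, as.integer(pb))] = votes[cbind(1:n, as.integer(pb))] + 1
}
yhat = levels(iris$Species)[max.col(votes)]  # majority vote across the B trees
mean(yhat == iris$Species)                   # accuracy of the aggregated predictor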
