
# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

# Exercises Section 4: Tree-based methods

# --------------------------------------------------------

###############################################################

### Exercise 1: growing and pruning a tree

###############################################################

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

# Recode response variable so as to make it a classification problem

High = ifelse(Carseats$Sales<=8, "No", "Yes")

# Create a data frame that includes both the predictors and response

# (a data frame is like an Excel spreadsheet, if you like)

CS = data.frame(Carseats, High)

CS$Sales = NULL

CS$High = as.factor(CS$High) # <-- this bit was missing

# Fit the tree using all predictors (except for variable Sales,

# which we have "recoded" into a categorical response variable)

# to response variable High

tree.out = tree(High~., CS)

summary(tree.out)

# plot the tree

plot(tree.out)

text(tree.out, pretty=0)

# pruning:

set.seed(3)

cv.CS = cv.tree(tree.out, FUN=prune.misclass) # cross-validation

?cv.tree

# ?prune.misclass

names(cv.CS)

# - size:

# number of terminal nodes in each tree in the cost-complexity pruning sequence.

# - deviance:

# total deviance of each tree in the cost-complexity pruning sequence.

# - k:

# the value of the cost-complexity pruning parameter of each tree in the sequence.

cv.CS
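
# (a minimal sketch: gather the pruning sequence into one data frame so that
# size, deviance and k can be read side by side)
cv.summary = data.frame(size=cv.CS$size, dev=cv.CS$dev, k=cv.CS$k)
cv.summary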

par(mfrow=c(1,2))

plot(cv.CS$size,cv.CS$dev,t='b')

abline(v=cv.CS$size[which.min(cv.CS$dev)])

plot(cv.CS$k,cv.CS$dev,t='b')

plot(cv.CS$size,cv.CS$k,t = 'b')

# use pruning:

# - use which.min(cv.CS$dev) to get the location of the optimum

# - retrieve the corresponding tree size

# - pass this information on to pruning function

opt.size = cv.CS$size[which.min(cv.CS$dev)]

# see:

plot(cv.CS$size,cv.CS$dev,t='b')

abline(v=cv.CS$size[which.min(cv.CS$dev)])

ptree = prune.misclass(tree.out, best=opt.size)

ptree

summary(ptree)
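
# (optional check, assuming summary.tree exposes a 'misclass' component for
# classification trees: compare training misclassification rates of the full
# and pruned trees)
err.full = summary(tree.out)$misclass
err.pruned = summary(ptree)$misclass
err.full[1]/err.full[2]     # training misclassification rate, full tree
err.pruned[1]/err.pruned[2] # training misclassification rate, pruned tree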

par(mfrow=c(1,2))

plot(tree.out)

text(tree.out, pretty=0)

plot(ptree)

text(ptree, pretty=0)

###############################################################

### Exercise 2: apply CV and ROC analysis

###############################################################

# Train/test:

set.seed(2)

n = nrow(CS)

itrain = sample(1:n, 200)

CS.test = CS[-itrain,]

High.test = High[-itrain]

# argument 'subset' makes it easy to handle training/test splits:

tree.out = tree(High~., CS, subset=itrain)

summary(tree.out)

plot(tree.out)

text(tree.out, pretty=0)

# prediction from full tree:

tree.pred = predict(tree.out, CS.test, type="class")

(tb1 = table(tree.pred,High.test))

# tree.pred1 = predict(tree.out,CS.test,type = 'where')

# prediction from pruned tree:

ptree.pred = predict(ptree, CS.test, type="class")

(tb2 = table(ptree.pred,High.test)) # confusion matrix

sum(diag(tb1))/sum(tb1)

sum(diag(tb2))/sum(tb2)

# perform ROC analysis

library(pROC)

# here we specify 'type="vector"' to retrieve continuous scores

# as opposed to predicted labels, so that we can apply varying

# threshold values to these scores to build the ROC curve:

ptree.probs = predict(ptree, CS.test, type="vector")

roc.p = roc(response=(High.test), predictor=ptree.probs[,1])

roc.p$auc

plot(roc.p)
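
# (a minimal sketch: the ROC analysis above used column 1 of the score matrix,
# i.e. P(High="No"); using the "Yes" column instead should give the same AUC,
# since pROC picks the direction automatically)
roc.p2 = roc(response=High.test, predictor=ptree.probs[,2])
roc.p2$auc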

###############################################################

### Exercise 3: find the tree

###############################################################

# ... can you find it?

# ISLR_Hitters <- ISLR::Hitters

# tree.out = tree( ISLR_Hitters)

# plot(tree.out)

# text(tree.out,pretty = 0)

###############################################################

### Exercise 4: grow a random forest

###############################################################

library(tree)

library(ISLR)

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

names(CS)

P = ncol(CS)-1  # number of features

# grow a single (unpruned) tree

tree.out = tree(High~., CS)

# fitted values for "training set"

tree.yhat = predict(tree.out, CS, type="class")

# grow a forest:

rf.out = randomForest(High~., CS)

# fitted values for "training set"

rf.yhat = predict(rf.out, CS, type="class")

# compare to bagging:

bag.out = randomForest(High~., CS, mtry=P)

# fitted values for "training set"

bag.yhat = predict(bag.out, CS, type="class")

# confusion matrix for tree:

(tb.tree = table(tree.yhat, High))

# confusion matrix for RF

(tb.rf = table(rf.yhat, High))

# Note this is different to the confusion matrix for the OOB observations:

(tb.rf2 = rf.out$confusion)

# confusion matrix for bagging

(tb.bag = table(bag.yhat, High))

(tb.bag2 = bag.out$confusion)

sum(diag(tb.tree))/sum(tb.tree)

sum(diag(tb.rf))/sum(tb.rf)

sum(diag(tb.bag))/sum(tb.bag)

sum(diag(tb.rf2[,1:2]))/sum(tb.rf2[,1:2]) # drop the class.error column of rf.out$confusion first
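
# (a minimal sketch: the OOB error rate reported by randomForest is also
# available as the last row of rf.out$err.rate)
tail(rf.out$err.rate, 1)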

# train-test split

set.seed(6041)

N = nrow(CS)

itrain = sample(1:N, 200)

CS.train = CS[itrain,]

CS.test = CS[-itrain,]

tree.out = tree(High~., CS.train)

# fitted values for "train set"

tree.yhat = predict(tree.out, CS.train, type="class")

# fitted values for "test set"

tree.pred = predict(tree.out, CS.test, type="class")

rf.out = randomForest(High~., CS.train)

# fitted values for "training set"

rf.yhat = predict(rf.out, CS.train, type="class")

# fitted values for "test set"

rf.pred = predict(rf.out, CS.test, type="class")

bag.out = randomForest(High~., CS.train, mtry=(ncol(CS)-1)) # mtry = all P predictors, i.e. bagging

# fitted values for "training set"

bag.yhat = predict(bag.out, CS.train, type="class")

# fitted values for "test set"

bag.pred = predict(bag.out, CS.test, type="class")

# confusion matrix for tree (test data):

(tb.tree = table(tree.pred, CS.test$High))

# confusion matrix for RF (test data):

(tb.rf = table(rf.pred, CS.test$High))

# confusion matrix for Bagging (test data):

(tb.bag = table(bag.pred, CS.test$High))

sum(diag(tb.tree))/sum(tb.tree)

sum(diag(tb.rf))/sum(tb.rf)

sum(diag(tb.bag))/sum(tb.bag)
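
# (a minimal sketch: collect the three test-set accuracies in one named vector
# for an easier side-by-side comparison)
round(c(tree = sum(diag(tb.tree))/sum(tb.tree),
        rf = sum(diag(tb.rf))/sum(tb.rf),
        bagging = sum(diag(tb.bag))/sum(tb.bag)), 3)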

###############################################################

### Exercise 5: benchmarking (this exercise is left as homework)

###############################################################

# bring in that code from Section 2 (below) and add to it:

library(class) # contains knn()

library(ISLR) # contains the datasets

library(pROC)

library(tree)

library(MASS) # contains lda() and qda()

library(randomForest)

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# tuning of the classifiers:

K.knn = 3

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

acc.rf = auc.rf = numeric(K)

#

for(k in 1:K){ # 10-fold CV loop

# split into train and test samples:

i.train = which(folds!=k)

dat.train = dat.cv[i.train, ]

dat.test = dat.cv[-i.train, ]

# adapt these sets for kNN:

x.train = dat.train[,-1]

y.train = dat.train[,1]

x.test = dat.test[,-1]

y.test = dat.test[,1]

x.train[,1] = as.numeric(x.train[,1])

x.test[,1] = as.numeric(x.test[,1])

# train classifiers:

knn.o = knn(x.train, x.test, y.train, K.knn)

glm.o = glm(default~., data=dat.train, family=binomial(logit))

lda.o = lda(default~., data=dat.train)

qda.o = qda(default~., data=dat.train)

rf.o = randomForest(default~., data=dat.train)

# test classifiers:

# (notice that predict.glm() does not have a functionality to

# return categorical values, so we compute them based on the

# scores by applying a threshold of 50%)

knn.p = knn.o

glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

lda.p = predict(lda.o, newdata=dat.test)$class

qda.p = predict(qda.o, newdata=dat.test)$class

rf.p = predict(rf.o, newdata=dat.test)

# corresponding confusion matrices:

tb.knn = table(knn.p, y.test)

tb.glm = table(glm.p, y.test)

tb.lda = table(lda.p, y.test)

tb.qda = table(qda.p, y.test)

tb.rf = table(rf.p, y.test)

# store prediction accuracies:

acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

acc.rf[k] = sum(diag(tb.rf)) / sum(tb.rf)

#

# ROC/AUC analysis:

# WARNING: for kNN, this is NOT Pr(Y=1 | X), but Pr(Y = Y_hat | X):

# knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

glm.p = predict(glm.o, newdata=dat.test, type="response")

lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

# auc.knn[k] = roc(y.test, knn.p)$auc

auc.glm[k] = roc(y.test, glm.p)$auc

auc.lda[k] = roc(y.test, lda.p)$auc

auc.qda[k] = roc(y.test, qda.p)$auc

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("GLM","LDA","QDA"))

boxplot(auc.knn, auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("kNN","GLM","LDA","QDA"))

###############################################################

### Exercise 6: Variable importance from RF

###############################################################

library(ISLR)

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

# grow a forest:

rf.out = randomForest(High~., CS)

# compare to bagging:

bag.out = randomForest(High~., CS, mtry=(ncol(CS)-1))

cbind(rf.out$importance, bag.out$importance)

par(mfrow=c(1,2))

varImpPlot(rf.out, pch=15, main="Ensemble method 1")

varImpPlot(bag.out, pch=15, main="Ensemble method 2")
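
# (a minimal sketch: rank the variables by mean decrease in Gini impurity, the
# measure plotted above)
imp = importance(rf.out)
imp[order(imp[,"MeanDecreaseGini"], decreasing=TRUE), , drop=FALSE]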

###############################################################

### Exercise 7: gradient boosting

###############################################################

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

# install.packages('gbm')

library(gbm)  # contains the GB model implementation

library(pROC)

# Recode response variable so as to make it a classification problem

High = as.factor(ifelse(Carseats$Sales<=8, "No", "Yes"))

CS = data.frame(Carseats, High)

# remove Sales from data frame, just to make formulas simpler!

CS$Sales = NULL 

set.seed(1)

itest = sample(1:nrow(CS), round(nrow(CS)/4))

CS.test = CS[itest,]

CS = CS[-itest,]

set.seed(4061)

# Note:

# gbm(High~., data=CS, distribution="bernoulli") fails when High is a factor,

# because distribution="bernoulli" expects a numeric 0/1 response,

# so we must recode the levels...

CS$High = as.numeric(CS$High=="Yes") # this hack could be useful

gb.out = gbm(High~., data=CS,

distribution="bernoulli", # use "gaussian" instead for regression

n.trees=5000, # size of the ensemble

interaction.depth=1) # depth of the trees, 1 = stumps

summary(gb.out$train.error)

# inspect output:

par(mar=c(4.5,6,1,1))

summary(gb.out, las=1)

plot(gb.out, i="Price")

plot(gb.out, i="ShelveLoc")

plot(gb.out)

gb.p = predict(gb.out, newdata=CS.test, n.trees=5000)

gb.p

roc.gb = roc(response=CS.test$High, predictor=gb.p)

plot(roc.gb)

roc.gb$auc
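
# (a minimal sketch: by default predict.gbm returns scores on the link
# (log-odds) scale; type="response" maps them to probabilities, which yields
# the same ROC curve since the transformation is monotonic)
gb.prob = predict(gb.out, newdata=CS.test, n.trees=5000, type="response")
roc(response=CS.test$High, predictor=gb.prob)$auc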

# compare AUC's with a Random Forest:

library(randomForest)

CS$High = as.factor(CS$High)

rf.out = randomForest(High~., CS, ntree=5000)

# fitted values for "training set"

rf.p = predict(rf.out, CS.test, type="prob")[,2]

roc.rf = roc(response=CS.test$High, predictor=rf.p)

plot(roc.rf, add=TRUE, col=2)

roc.gb$auc

roc.rf$auc

###############################################################

### Exercise 8: gradient boosting using caret...

###############################################################

# Plug in the following snips of code within demo code

# Section4b_demo_using_caret.R:

############ (I) Classification example ############

### Gradient boosting (using caret) for classification

rm(list=ls()) # clear the environment

library(ISLR) # contains the data

library(caret) # contains everything else

set.seed(4061) # for reproducibility

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change the response variable to a factor to make this a

# classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),

"High","Low"))

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set for later

# x = select(dat.train,-"Salary") ### if using dplyr

# training set:

x = dat.train

x$Salary = NULL

y = as.factor(dat.train$Salary)

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='bernoulli')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

confusionMatrix(reference=dat.validation$Salary, data=gb.pred,

mode="everything")

############ (II) Regression example ############

### Gradient boosting (using caret) for regression

rm(list=ls()) # clear the environment

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

# hist(dat$Salary)

dat$Salary = log(dat$Salary)

n = nrow(dat)

NC = ncol(dat)

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,]

x = dat.train

x$Salary = NULL

y = dat.train$Salary

ytrue = dat.validation$Salary

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='gaussian')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

mean((gb.pred-ytrue)^2)
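
# (a minimal sketch: caret's postResample() reports RMSE, R-squared and MAE
# for the same hold-out predictions)
postResample(gb.pred, ytrue)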
