
# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

# Exercises Section 4: Tree-based methods

# --------------------------------------------------------

###############################################################

### Exercise 1: growing and pruning a tree

###############################################################

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

# Recode response variable so as to make it a classification problem

High = ifelse(Carseats$Sales<=8, "No", "Yes")

# Create a data frame that includes both the predictors and response

# (a data frame is like an Excel spreadsheet, if you like)

CS = data.frame(Carseats, High)

CS$Sales = NULL

CS$High = as.factor(CS$High) # <-- this bit was missing

# Fit the tree using all predictors (except for variable Sales,

# which we have "recoded" into a categorical response variable)

# to response variable High

tree.out = tree(High~., CS)

summary(tree.out)

# plot the tree

plot(tree.out)

text(tree.out, pretty=0)

# pruning:

set.seed(3)

cv.CS = cv.tree(tree.out, FUN=prune.misclass) # cross-validation

?cv.tree

# ?prune.misclass

names(cv.CS)

# - size:

# number of terminal nodes in each tree in the cost-complexity pruning sequence.

# - deviance:

# total deviance of each tree in the cost-complexity pruning sequence.

# - k:

# the value of the cost-complexity pruning parameter of each tree in the sequence.

cv.CS
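
# (a minimal sketch: gather the pruning sequence into one data frame so that
# size, deviance and k can be read side by side)
cv.summary = data.frame(size=cv.CS$size, dev=cv.CS$dev, k=cv.CS$k)
cv.summary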

par(mfrow=c(1,2))

plot(cv.CS$size,cv.CS$dev,t='b')

abline(v=cv.CS$size[which.min(cv.CS$dev)])

plot(cv.CS$k,cv.CS$dev,t='b')

plot(cv.CS$size,cv.CS$k,t = 'b')

# use pruning:

# - use which.min(cv.CS$dev) to get the location of the optimum

# - retrieve the corresponding tree size

# - pass this information on to pruning function

opt.size = cv.CS$size[which.min(cv.CS$dev)]

# see:

plot(cv.CS$size,cv.CS$dev,t='b')

abline(v=cv.CS$size[which.min(cv.CS$dev)])

ptree = prune.misclass(tree.out, best=opt.size)

ptree

summary(ptree)
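
# (optional check, assuming summary.tree exposes a 'misclass' component for
# classification trees: compare training misclassification rates of the full
# and pruned trees)
err.full = summary(tree.out)$misclass
err.pruned = summary(ptree)$misclass
err.full[1]/err.full[2]     # training misclassification rate, full tree
err.pruned[1]/err.pruned[2] # training misclassification rate, pruned tree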

par(mfrow=c(1,2))

plot(tree.out)

text(tree.out, pretty=0)

plot(ptree)

text(ptree, pretty=0)

###############################################################

### Exercise 2: apply CV and ROC analysis

###############################################################

# Train/test:

set.seed(2)

n = nrow(CS)

itrain = sample(1:n, 200)

CS.test = CS[-itrain,]

High.test = High[-itrain]

# argument 'subset' makes it easy to handle training/test splits:

tree.out = tree(High~., CS, subset=itrain)

summary(tree.out)

plot(tree.out)

text(tree.out, pretty=0)

# prediction from full tree:

tree.pred = predict(tree.out, CS.test, type="class")

(tb1 = table(tree.pred,High.test))

# tree.pred1 = predict(tree.out,CS.test,type = 'where')

# prediction from pruned tree:

ptree.pred = predict(ptree, CS.test, type="class")

(tb2 = table(ptree.pred,High.test)) # confusion matrix

sum(diag(tb1))/sum(tb1)

sum(diag(tb2))/sum(tb2)

# perform ROC analysis

library(pROC)

# here we specify 'type="vector"' to retrieve continuous scores

# as opposed to predicted labels, so that we can apply varying

# threshold values to these scores to build the ROC curve:

ptree.probs = predict(ptree, CS.test, type="vector")

roc.p = roc(response=(High.test), predictor=ptree.probs[,1])

roc.p$auc

plot(roc.p)
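
# (a minimal sketch: the ROC analysis above used column 1 of the score matrix,
# i.e. P(High="No"); using the "Yes" column instead should give the same AUC,
# since pROC picks the direction automatically)
roc.p2 = roc(response=High.test, predictor=ptree.probs[,2])
roc.p2$auc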

###############################################################

### Exercise 3: find the tree

###############################################################

# ... can you find it?

# ISLR_Hitters <- ISLR::Hitters

# tree.out = tree( ISLR_Hitters)

# plot(tree.out)

# text(tree.out,pretty = 0)

###############################################################

### Exercise 4: grow a random forest

###############################################################

library(tree)

library(ISLR)

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

names(CS)

P = ncol(CS)-1  # number of features

# grow a single (unpruned) tree

tree.out = tree(High~., CS)

# fitted values for "training set"

tree.yhat = predict(tree.out, CS, type="class")

# grow a forest:

rf.out = randomForest(High~., CS)

# fitted values for "training set"

rf.yhat = predict(rf.out, CS, type="class")

# compare to bagging:

bag.out = randomForest(High~., CS, mtry=P)

# fitted values for "training set"

bag.yhat = predict(bag.out, CS, type="class")

# confusion matrix for tree:

(tb.tree = table(tree.yhat, High))

# confusion matrix for RF

(tb.rf = table(rf.yhat, High))

# Note this is different to the confusion matrix for the OOB observations:

(tb.rf2 = rf.out$confusion)

# confusion matrix for bagging

(tb.bag = table(bag.yhat, High))

(tb.bag2 = bag.out$confusion)

sum(diag(tb.tree))/sum(tb.tree)

sum(diag(tb.rf))/sum(tb.rf)

sum(diag(tb.bag))/sum(tb.bag)

sum(diag(tb.rf2[,1:2]))/sum(tb.rf2[,1:2]) # drop the class.error column of rf.out$confusion first
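
# (a minimal sketch: the OOB error rate reported by randomForest is also
# available as the last row of rf.out$err.rate)
tail(rf.out$err.rate, 1)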

# train-test split

set.seed(6041)

N = nrow(CS)

itrain = sample(1:N, 200)

CS.train = CS[itrain,]

CS.test = CS[-itrain,]

tree.out = tree(High~., CS.train)

# fitted values for "train set"

tree.yhat = predict(tree.out, CS.train, type="class")

# fitted values for "test set"

tree.pred = predict(tree.out, CS.test, type="class")

rf.out = randomForest(High~., CS.train)

# fitted values for "training set"

rf.yhat = predict(rf.out, CS.train, type="class")

# fitted values for "test set"

rf.pred = predict(rf.out, CS.test, type="class")

bag.out = randomForest(High~., CS.train, mtry=(ncol(CS)-1)) # mtry = all P predictors, i.e. bagging

# fitted values for "training set"

bag.yhat = predict(bag.out, CS.train, type="class")

# fitted values for "test set"

bag.pred = predict(bag.out, CS.test, type="class")

# confusion matrix for tree (test data):

(tb.tree = table(tree.pred, CS.test$High))

# confusion matrix for RF (test data):

(tb.rf = table(rf.pred, CS.test$High))

# confusion matrix for Bagging (test data):

(tb.bag = table(bag.pred, CS.test$High))

sum(diag(tb.tree))/sum(tb.tree)

sum(diag(tb.rf))/sum(tb.rf)

sum(diag(tb.bag))/sum(tb.bag)
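
# (a minimal sketch: collect the three test-set accuracies in one named vector
# for an easier side-by-side comparison)
round(c(tree = sum(diag(tb.tree))/sum(tb.tree),
        rf = sum(diag(tb.rf))/sum(tb.rf),
        bagging = sum(diag(tb.bag))/sum(tb.bag)), 3)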

###############################################################

### Exercise 5: benchmarking (this exercise is left as homework)

###############################################################

# bring in that code from Section 2 (below) and add to it:

library(class) # contains knn()

library(ISLR) # contains the datasets

library(pROC)

library(tree)

library(MASS) # contains lda() and qda()

library(randomForest)

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# tuning of the classifiers:

K.knn = 3

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

acc.rf = auc.rf = numeric(K)

#

for(k in 1:K){ # 10-fold CV loop

# split into train and test samples:

i.train = which(folds!=k)

dat.train = dat.cv[i.train, ]

dat.test = dat.cv[-i.train, ]

# adapt these sets for kNN:

x.train = dat.train[,-1]

y.train = dat.train[,1]

x.test = dat.test[,-1]

y.test = dat.test[,1]

x.train[,1] = as.numeric(x.train[,1])

x.test[,1] = as.numeric(x.test[,1])

# train classifiers:

knn.o = knn(x.train, x.test, y.train, K.knn)

glm.o = glm(default~., data=dat.train, family=binomial(logit))

lda.o = lda(default~., data=dat.train)

qda.o = qda(default~., data=dat.train)

rf.o = randomForest(default~., data=dat.train)

# test classifiers:

# (notice that predict.glm() does not have a functionality to

# return categorical values, so we compute them based on the

# scores by applying a threshold of 50%)

knn.p = knn.o

glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

lda.p = predict(lda.o, newdata=dat.test)$class

qda.p = predict(qda.o, newdata=dat.test)$class

rf.p = predict(rf.o, newdata=dat.test)

# corresponding confusion matrices:

tb.knn = table(knn.p, y.test)

tb.glm = table(glm.p, y.test)

tb.lda = table(lda.p, y.test)

tb.qda = table(qda.p, y.test)

tb.rf = table(rf.p, y.test)

# store prediction accuracies:

acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

acc.rf[k] = sum(diag(tb.rf)) / sum(tb.rf)

#

# ROC/AUC analysis:

# WARNING: for kNN, this is NOT Pr(Y=1 | X), but Pr(Y = Y_hat | X):

# knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

glm.p = predict(glm.o, newdata=dat.test, type="response")

lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

# auc.knn[k] = roc(y.test, knn.p)$auc

auc.glm[k] = roc(y.test, glm.p)$auc

auc.lda[k] = roc(y.test, lda.p)$auc

auc.qda[k] = roc(y.test, qda.p)$auc

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("GLM","LDA","QDA"))

boxplot(auc.knn, auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("kNN","GLM","LDA","QDA"))

###############################################################

### Exercise 6: Variable importance from RF

###############################################################

library(ISLR)

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

# grow a forest:

rf.out = randomForest(High~., CS)

# compare to bagging:

bag.out = randomForest(High~., CS, mtry=(ncol(CS)-1))

cbind(rf.out$importance, bag.out$importance)

par(mfrow=c(1,2))

varImpPlot(rf.out, pch=15, main="Ensemble method 1")

varImpPlot(bag.out, pch=15, main="Ensemble method 2")
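
# (a minimal sketch: rank the variables by mean decrease in Gini impurity, the
# measure plotted above)
imp = importance(rf.out)
imp[order(imp[,"MeanDecreaseGini"], decreasing=TRUE), , drop=FALSE]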

###############################################################

### Exercise 7: gradient boosting

###############################################################

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

# install.packages('gbm')

library(gbm)  # contains the GB model implementation

library(pROC)

# Recode response variable so as to make it a classification problem

High = as.factor(ifelse(Carseats$Sales<=8, "No", "Yes"))

CS = data.frame(Carseats, High)

# remove Sales from data frame, just to make formulas simpler!

CS$Sales = NULL 

set.seed(1)

itest = sample(1:nrow(CS), round(nrow(CS)/4))

CS.test = CS[itest,]

CS = CS[-itest,]

set.seed(4061)

# Note:

# gbm(High~., data=CS, distribution="bernoulli") fails when High is a factor,

# because distribution="bernoulli" expects a numeric 0/1 response,

# so we must recode the levels...

CS$High = as.numeric(CS$High=="Yes") # this hack could be useful

gb.out = gbm(High~., data=CS,

distribution="bernoulli", # use "gaussian" instead for regression

n.trees=5000, # size of the ensemble

interaction.depth=1) # depth of the trees, 1 = stumps

summary(gb.out$train.error)

# inspect output:

par(mar=c(4.5,6,1,1))

summary(gb.out, las=1)

plot(gb.out, i="Price")

plot(gb.out, i="ShelveLoc")

plot(gb.out)

gb.p = predict(gb.out, newdata=CS.test, n.trees=5000)

gb.p

roc.gb = roc(response=CS.test$High, predictor=gb.p)

plot(roc.gb)

roc.gb$auc
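
# (a minimal sketch: by default predict.gbm returns scores on the link
# (log-odds) scale; type="response" maps them to probabilities, which yields
# the same ROC curve since the transformation is monotonic)
gb.prob = predict(gb.out, newdata=CS.test, n.trees=5000, type="response")
roc(response=CS.test$High, predictor=gb.prob)$auc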

# compare AUC's with a Random Forest:

library(randomForest)

CS$High = as.factor(CS$High)

rf.out = randomForest(High~., CS, ntree=5000)

# fitted values for "training set"

rf.p = predict(rf.out, CS.test, type="prob")[,2]

roc.rf = roc(response=CS.test$High, predictor=rf.p)

plot(roc.rf, add=TRUE, col=2)

roc.gb$auc

roc.rf$auc

###############################################################

### Exercise 8: gradient boosting using caret...

###############################################################

# Plug in the following snips of code within demo code

# Section4b_demo_using_caret.R:

############ (I) Classification example ############

### Gradient boosting (using caret) for classification

rm(list=ls()) # clear the environment

library(ISLR) # contains the data

library(caret) # contains everything else

set.seed(4061) # for reproducibility

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change the response variable to a factor to make this a

# classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),

"High","Low"))

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set for later

# x = select(dat.train,-"Salary") ### if using dplyr

# training set:

x = dat.train

x$Salary = NULL

y = as.factor(dat.train$Salary)

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='bernoulli')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

confusionMatrix(reference=dat.validation$Salary, data=gb.pred,

mode="everything")

############ (II) Regression example ############

### Gradient boosting (using caret) for regression

rm(list=ls()) # clear the environment

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

# hist(dat$Salary)

dat$Salary = log(dat$Salary)

n = nrow(dat)

NC = ncol(dat)

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,]

x = dat.train

x$Salary = NULL

y = dat.train$Salary

ytrue = dat.validation$Salary

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='gaussian')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

mean((gb.pred-ytrue)^2)
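
# (a minimal sketch: caret's postResample() reports RMSE, R-squared and MAE
# for the same hold-out predictions)
postResample(gb.pred, ytrue)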
