2022-04-12-2

#### Section 7: Feature Selection ####

library(ISLR)

library(leaps) # contains regsubsets()

# identify which variables are important and which can be dropped

#### Exercise 1: best subset selection ####

rm(list=ls())

Hitters = na.omit(Hitters)

dim(Hitters)

# (1) Perform best subset selection

reg.full = regsubsets(Salary~., data=Hitters, method="exhaustive")

# method=c("exhaustive","backward", "forward", "seqrep"),

# method="exhaustive"穷举 是最优的

# (2) Interpret the standard regsubsets() output.

names(summary(reg.full))

summary(reg.full) # a "*" means the variable is included in that model; no "*" means it is left out

summary(reg.full)$which      # logical matrix tracking the exhaustive search

summary(reg.full)$outmat    # same trace, in "*" notation

# for example, the selected models could read:

# y ~ x1 + x5

# y ~ x2 + x3 + x4

# (3) Plot RSS as a function of the number of variables

RSS = summary(reg.full)$rss

plot(RSS, pch=20, t='b', xlab="Number of covariates", ylab='RSS') # residual sum of squares

# RSS keeps decreasing as the number of covariates grows and never levels off, so we switch to the adjusted R-squared as the selection criterion

R2adj = summary(reg.full)$adjr2 # adjusted R-squared

plot(R2adj, pch=20, t='b', xlab="Number of covariates",

    ylab='Adjusted R^2')

# Note on R-squared (the coefficient of determination):

# every time a covariate is added, R-squared increases,

# so the model appears more and more accurate as covariates accumulate.

# In reality this can be misleading: some covariates may be completely

# unrelated to the response, and adding them improves neither the fit

# nor the predictive accuracy. To avoid this,

# we use the adjusted R-squared instead (a quick check follows).
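
# Quick sanity check (a minimal sketch, not part of the original lab): the adjusted
# R-squared reported by regsubsets() should follow the usual formula
#   R2adj = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where p is the number of covariates in the model of that size.
n.obs = nrow(Hitters)
p.vec = 1:length(summary(reg.full)$rsq)
manual.adjr2 = 1 - (1 - summary(reg.full)$rsq) * (n.obs - 1) / (n.obs - p.vec - 1)
cbind(manual.adjr2, leaps.adjr2 = summary(reg.full)$adjr2) # the two columns should agree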

# (4) Increase the maximum model size to 19 covariates and find which submodel has:

# - the smallest RSS

# - the highest adjusted R^2

reg.full = regsubsets(Salary~., data=Hitters, nvmax=19)

par(mfrow=c(1,2))

RSS = summary(reg.full)$rss

plot(RSS, pch=20, t='b', xlab="Number of covariates", ylab='RSS')

# RSS only starts to level off around 10 covariates; it is not a useful selection criterion here

R2 = summary(reg.full)$rsq      # ordinary (unadjusted) R-squared

R2adj = summary(reg.full)$adjr2  # adjusted R-squared

plot(R2, pch=20, t='b', xlab="Number of covariates",

    ylab='Original and adjusted R^2')

points(R2adj, col=4, pch=15, t='b')

# find the optimal model size, i.e. how many variables to keep

R2adj.index = which.max(R2adj)

# mark this size on the plot

abline(v = R2adj.index)

# 11 variables

# extract the corresponding model:

summary(reg.full)$outmat[R2adj.index,]

mod = summary(reg.full)$which[R2adj.index,] # logical vector of the selected variables

names(which(mod))

# plot the corresponding heatmap:

plot(reg.full, scale="adjr2")

summary(reg.full)$outmat

# this matches the trace shown above

# A rule of thumb takes 0.5 as a cut-off for the adjusted R-squared:

# if it falls below 0.5, revisit which covariates were included and which were left out.

# If the adjusted R-squared differs markedly from the ordinary R-squared,

# the chosen covariates do not track the variation in the response well,

# or some useful covariates have been omitted.

# The larger the gap between the two, the poorer the fit of the model (see the quick comparison below).
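
# A quick side-by-side look at R^2, adjusted R^2 and their gap per model size
# (a small sketch reusing the R2 and R2adj vectors defined above):
round(data.frame(size=1:length(R2), R2=R2, R2adj=R2adj, gap=R2-R2adj), 3)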

#### Exercise 2: fwd/bwd subset selection ####

# 1. Perform forward and backward selection using leaps::regsubsets().

# 2. Compare the plots of RSS and adjusted R^2 w.r.t. model size.

# 3. Compare the covariates in the 4-variable models selected by each method.

# 4. Explore the final model selections in any other way.

rm(list=ls())

Hitters = na.omit(Hitters)

dim(Hitters)

# (1) Run both methods

reg.fwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")

reg.bwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="backward")

# (2) Compare

par(mfrow=c(1,2))

# plot RSS

plot(summary(reg.bwd)$rss, t='b', pch=20, cex=1.5)

points(summary(reg.fwd)$rss, t='b', pch=15, col=2)

# plot adjusted R^2

plot(summary(reg.bwd)$adjr2, t='b', pch=20, cex=1.5)

points(summary(reg.fwd)$adjr2, t='b', pch=15, col=2)

# optimal model sizes

R2adj.fwd.index = which.max(summary(reg.fwd)$adjr2)

R2adj.bwd.index = which.max(summary(reg.bwd)$adjr2)

abline(v=c(R2adj.bwd.index, R2adj.fwd.index), col=c(1,2))

# (3) Extract the 4-variable model selected by each method

# 4-variable models:

# id: model size (number of variables)

coef(reg.fwd, id=4)

coef(reg.bwd, id=4)

# the exhaustive search remains the benchmark

reg.full = regsubsets(Salary~., data=Hitters, method="exhaustive")

names(which(summary(reg.full)$which[4,]==TRUE))

# extract the optimal model from the backward elimination procedure:

coef(reg.bwd, id=R2adj.bwd.index)

#### Exercise 3: generate predictions from regsubsets() output? ####

# (but see Exercise 5: with two extra lines of code, predict() can be used via step())

rm(list=ls())

dat = na.omit(Hitters)

n = nrow(dat)

itrain = sample(1:n, 150)

reg.fwd = regsubsets(Salary~., data=dat, nvmax=10,

                    method="forward", subset=itrain)

#predict(reg.fwd, newdata=dat[-itrain,])

# predict() does not work on regsubsets output, so we have to build the predictions ourselves

beta.hat = coef(reg.fwd, id=4)   

# create matrix X:

test.dat = model.matrix(Salary~., data = dat[-itrain,])

# get the test data matrix:

Xtest = test.dat[,names(beta.hat)]

# compute X %*% beta as in Y = X beta + epsilon:

pred = Xtest %*% beta.hat 

pred = as.numeric(pred)    # make this a vector instead

# compute prediction RMSE:

sqrt( mean((pred - dat$Salary[-itrain])^2) )
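
# The steps above can be wrapped into a small helper so that predict() works on
# regsubsets objects directly (a sketch along the lines of the ISLR lab; the
# function name predict.regsubsets is chosen so that S3 dispatch picks it up):
predict.regsubsets <- function(object, newdata, id, ...){
  form = as.formula(object$call[[2]])   # recover the formula used in the regsubsets() call
  mat = model.matrix(form, newdata)     # build the test design matrix
  beta = coef(object, id=id)            # coefficients of the model of size 'id'
  as.numeric(mat[, names(beta)] %*% beta)
}
# usage: predict(reg.fwd, newdata=dat[-itrain,], id=4)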

#### Exercise 4: fit and predict using stats::step() ####

# Use the function step() from R's base package stats to perform stepwise selection on the full Hitters dataset (once observations with missing values have been removed, as in Exercise 2).

# 1. Compare the backward selection outputs of step() and regsubsets().

# 2. Quote the model fit summary for the final model chosen via backward selection with step().

# step(): selects the model by AIC within a stepwise algorithm (see the sketch below).
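
# For linear models the AIC used by step() is (up to an additive constant)
#   n * log(RSS/n) + 2 * edf,
# which is what extractAIC() returns. A small self-contained check of this
# (a sketch, not part of the original lab):
tmp = na.omit(ISLR::Hitters)
fit = lm(Salary~., data=tmp)
rss = sum(residuals(fit)^2)
n.obs = nrow(tmp)
edf = length(coef(fit))                      # number of estimated coefficients
c(manual = n.obs*log(rss/n.obs) + 2*edf,
  extractAIC = extractAIC(fit)[2])           # the two values should match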

rm(list=ls())

Hitters = na.omit(Hitters)

dim(Hitters)

# stepwise selection from regsubsets():

reg.fwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")

reg.bwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="backward")

# stepwise selection from step():

lm.out = lm(Salary~., data=Hitters) # full model

step.bth = step(lm.out, direction="both") # the default direction

step.fwd = step(lm.out, direction="forward")

step.bck = step(lm.out, direction="backward")

# compare backward selections from regsubsets() and step():

#

# ... from step()...

coef(step.bck)

length(coef(step.bck))

summary(step.bck)

# Nice: we get the model directly! No need to reconstruct

# fitted values by hand this time!

#

# ... from regsubsets()...

i.opt = which.min(summary(reg.bwd)$bic)

i.set.opt = summary(reg.bwd)$which[i.opt,]

summary(reg.bwd)$which[i.opt, i.set.opt]

# Different models, but smaller one is included in larger one.

# Difference is likely due to using different criteria

# (AIC v BIC, BIC yielding a smaller model).

# NB: we can also assess feature contributions in terms

# of magnitude of their effect:

coefs = abs(coef(step.fwd))/sum(abs(coef(step.fwd)), na.rm=TRUE)*100

coefs = coefs[-1]

coefs[order(abs(coefs), decreasing=TRUE)]

# each variable's share (%) of total importance, sorted in decreasing order (a "%" sign could be appended, see below)
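
# To display these shares with an explicit "%" sign (a small formatting sketch):
paste0(names(sort(coefs, decreasing=TRUE)), ": ", round(sort(coefs, decreasing=TRUE), 1), "%")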

#### Exercise 5: generate predictions from step() output? ####

rm(list=ls())

dat = na.omit(Hitters)

n = nrow(dat)

itrain = sample(1:n, 150)

lmo = lm(Salary~., data=dat, subset=itrain) # this line is new relative to Exercise 3

reg.fwd = step(lmo, direction="forward") # and so is this one

pred = predict(reg.fwd, newdata=dat[-itrain,])

sqrt(mean((pred - dat$Salary[-itrain])^2))

#### Stepwise (logistic) regression with caret ####

rm(list=ls())

library(caret) # needed for createDataPartition(), trainControl(), train() and twoClassSummary()

dat = na.omit(Hitters)

n = nrow(dat)

dat = dat[sample(1:n,n),]

dat$y = as.factor(dat$Salary>mean(dat$Salary)) # indicator: is the salary above the mean?

dat$Salary = NULL

levels(dat$y) = c("low","hi")

# stratified split into training + test sets

itrain = createDataPartition(dat$y, p=.75, times=1)[[1]]

dtrain = dat[itrain,]

dtest = dat[-itrain,]

trC = trainControl(method="cv", number=5,

                  savePredictions = TRUE,

                  classProbs = TRUE)

co = train(y~., data=dtrain, method='glmStepAIC',

          trControl=trC, distribution='binomial')

summary(co$finalModel)

names(co)

preds <- predict(co, dtest)

probs <- predict(co, dtest, type="prob")

table(dtest$y,preds)

pd = data.frame(obs=dtest$y,pred=preds,low=probs$low)

twoClassSummary(pd, lev=levels(dtest$y))

#### on simulated data ####

rm(list=ls())

library(glmnet)

library(leaps)

library(caret)

# We start by creating a dummy data set, so that we know which

# features actually make up the observations. Here a linear

# combination of nonlinear expressions of 3 features is used

# to create observations Y.

# The 3 features are:

# - education level

# - number of years of experience

# - some employee rating assessed by their company

# The response variable Y is salary.

# Another 20 features containing random noise are also created

# and added to the dataset. They should not be selected by our

# model selection method...

n = 500 # desired sample size

# level of education (1=secondary level, 2=BSc, 3=MSc, 4=PhD/MBA):

edu = sample(c(1:4), size=n, prob=c(.05,.5,.35,.1), replace=TRUE)

# nbr years experience:

yex = rpois(n=n, lambda=6)

# some obscure employee rating made up by the company:

ert = pmin(5, pmax(0, 5-rexp(n=n, rate=2)))

# employee salary (response variable):

sal = 2*exp(.15*yex) + 3.2*log(edu) + 4*ert

par(mfrow=c(2,2))

plot(factor(edu), main="education")

hist(yex, main="years experience")

hist(ert, main="employee rating")

hist(sal, main="Salaries")

par(mfrow=c(1,3), pch=20)

boxplot(sal~factor(edu), main="salary wrt\n education")

plot(yex, sal, main="salary v\n years experience")

plot(ert, sal, main="salary v\n employee rating")

# now make up some dummy features...

# we don't bother changing scales/means since we will

# be normalizing these features...

p = 20

xtra = matrix(rnorm(n*p), ncol=p, nrow=n)

colnames(xtra) = paste("X",c(1:p),sep="")

par(mfrow=c(4,5), pch=20, mar=c(1,1,1,1))

for(j in 1:p){

  plot(xtra[,j], sal, main=paste(j))

}

# the data frame(s):

features = data.frame(edu,yex,ert,xtra)

dat = data.frame(sal,features) # may be more convenient sometimes

# train-test split:

i.train = sample(1:n, size=300, replace=FALSE)

# line up data in several formats for convenience:

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

x.train = features[i.train,]

y.train = sal[i.train]

x.test = features[-i.train,]

y.test = sal[-i.train]

# not forgetting matrix forms for the likes of glmnet:

xm = model.matrix(sal~.,data=features)[,-1]

xm.train = xm[i.train,]

xm.test = xm[-i.train,]

#### check out what LASSO would tell us ####

lasso.cv = cv.glmnet(xm.train, y.train)

lasso = glmnet(xm.train, y.train, lambda=lasso.cv$lambda.min)

coef(lasso)

# c.lasso = caret::train(x.train, y.train, method="glmnet")

# how about cross-validating this?

K = 10

folds = cut(1:n, breaks=K, labels=FALSE)

sel = matrix(0, nrow=K, ncol=ncol(features))

colnames(sel) = names(features)

for(k in 1:K){

  itr = which(folds!=k)

  lasso.cv = cv.glmnet(xm[itr,], sal[itr])

  lasso = glmnet(xm[itr,], sal[itr], lambda=lasso.cv$lambda.min)

  isel = which(coef(lasso)[-1] != 0)

  sel[k,isel] = 1

}

apply(sel,2,mean)*100

# LASSO thinks X1 and X14, for example, are important...

# We'd be better off increasing the regularization parameter,

# e.g. using lasso.cv$lambda.min*2 instead (try it!).
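
# A quick sketch of that suggestion (a variation on the training-set fit above,
# not part of the original script):
lasso.cv2 = cv.glmnet(xm.train, y.train)
lasso2 = glmnet(xm.train, y.train, lambda=lasso.cv2$lambda.min*2)
which(coef(lasso2)[-1] != 0)   # noise features should be less likely to survive
# cv.glmnet also provides lambda.1se, a more conservative built-in choice:
coef(glmnet(xm.train, y.train, lambda=lasso.cv2$lambda.1se))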

#### perform FS with caret::rfe based on linear regression ####

subsets <- c(1:5, 10, 15, 20, ncol(features))

ctrl <- rfeControl(functions = lmFuncs,

                  method = "cv",

                  number = 10,

                  # method = "repeatedcv",

                  # repeats = 5,

                  verbose = FALSE)

lm.rfe <- rfe(x.train, y.train,

              sizes = subsets,

              rfeControl = ctrl)

lm.rfe

# This function has picked the correct subset of features
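
# A couple of optional ways to inspect the rfe output (a hedged sketch; predictors()
# and the plot method are provided by caret for rfe objects):
predictors(lm.rfe)        # names of the selected features
plot(lm.rfe, type="l")    # resampled performance vs subset size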

#### compare with leaps... ####

reg.bwd = regsubsets(sal~., data=dat.train,

                    nvmax=ncol(features))

opt.mod = which.max(summary(reg.bwd)$adjr2)

isel = which(summary(reg.bwd)$which[opt.mod,-1])

isel

# how about cross-validating this?

K = 10

folds = cut(1:n, breaks=K, labels=FALSE)

sel = matrix(0, nrow=K, ncol=ncol(features))

colnames(sel) = names(features)

for(k in 1:K){

  itr = which(folds!=k)

  reg.bwd = regsubsets(sal~., data=dat[itr,],

                      nvmax=ncol(features))

  opt.mod = which.max(summary(reg.bwd)$adjr2)

  isel = which(summary(reg.bwd)$which[opt.mod,-1])

  sel[k,isel] = 1

}

apply(sel,2,mean)*100

# X1 and X14, again...


#### perform FS with caret::rfe based on RF ####

subsets <- c(1:5, 10, 15, 20, ncol(features))

ctrl <- rfeControl(functions = rfFuncs,

                  method = "cv",

                  number = 10,

                  # method = "repeatedcv",

                  # repeats = 5,

                  verbose = FALSE)

rf.rfe <- rfe(x.train, y.train,

              sizes = subsets,

              rfeControl = ctrl)

rf.rfe

# worth the wait!

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Section 8: Unsupervised learning ####

# Use(s) of PCA...

#

# Principal Component Analysis (PCA) is a statistical method for unsupervised dimension reduction:

# it works on the X's alone (here it serves as a preprocessing step before classifying Y from a few derived X's).

# An orthogonal transformation converts a set of possibly correlated variables into a set of linearly uncorrelated variables; the transformed variables are called the principal components.

# The k-means clustering algorithm is an iterative clustering method. To partition the data into K groups,

# K objects are first chosen at random as initial cluster centres; the distance from each object to each centre is computed, and each object is assigned to its nearest centre.

# A centre together with the objects assigned to it forms a cluster. Each time objects are reassigned, the cluster centres are recomputed from the current members, and the process repeats until a stopping criterion is met:

# no (or a minimal number of) objects change cluster, no (or a minimal number of) centres change, or the within-cluster sum of squares reaches a local minimum. A bare-bones illustration follows.
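
# The k-means loop described above can be written out in a few lines
# (an illustrative sketch on toy data; in practice use stats::kmeans() as done below):
set.seed(1)
toy = matrix(rnorm(100*2), ncol=2)          # toy data: 100 points in 2D
K = 3
centers = toy[sample(1:nrow(toy), K), ]     # random initial centres
for(iter in 1:20){
  # assign each point to its nearest centre (Euclidean distance)
  d = as.matrix(dist(rbind(centers, toy)))[-(1:K), 1:K]
  cl = apply(d, 1, which.min)
  # recompute each centre as the mean of its current members (keep old centre if empty)
  new.centers = t(sapply(1:K, function(k){
    if(sum(cl==k)==0) centers[k,] else colMeans(toy[cl==k, , drop=FALSE])
  }))
  if(max(abs(new.centers - centers)) < 1e-8) break  # stop when centres stabilise
  centers = new.centers
}
table(cl)   # cluster sizes from the manual loop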


# Principle of PCA: decisions are not based on y, but on the x's only.

# Purpose of PCA: filter out some variables in the original domain.

# Drawback of PCA: the new features are composite features, i.e. linear combinations of all input features.

# Why scale in PCA:

# after scaling, the matrix formed by the x's becomes the correlation matrix instead of the covariance matrix;

# PCA then redistributes the information so as to maximise the variance,

# rearranging the data so as to extract most of the information early on (in the first few components).

# Where the numbers in a PCA, and the percentages they account for, come from:

# for the matrix formed by the x's, each principal component i has an eigenvalue lambda_i,

# and the proportion of variance it explains is lambda_i / sum_j(lambda_j), i.e. pca$sdev[i]^2 / sum(pca$sdev^2) in prcomp() terms (see the check below).

# This is why we standardise the data: so that the raw scale of a variable does not drive its eigenvalue and hence the PCA.
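
# A quick self-contained check of that relationship, on a built-in dataset
# (the ionosphere data is only loaded further down):
pca.demo = prcomp(USArrests, scale=TRUE)
lambda = pca.demo$sdev^2                      # eigenvalues of the correlation matrix
rbind(manual = lambda/sum(lambda),
      reported = summary(pca.demo)$importance[2,])   # "Proportion of Variance" row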

#install.packages('fdm2id')


rm(list=ls())

library(caret)

library(pROC)

if(1){

  # library(mlbench) #also contains a version of the Ionosphere dataset

  # head(Ionosphere)

  library(fdm2id)

  head(ionosphere) # show the first few rows of the dataset

  dat = ionosphere

  class(dat[,1])

  class(dat[,ncol(dat)])

} else {

  require(mlbench)

  data(Sonar)

  dat = Sonar

}

# rename columns to a generic X/Y naming convention

names(dat) = c(paste("X",1:(ncol(dat)-1),sep=''), "Y") # predictors become X1..Xp, the response becomes Y

head(dat)

y = dat[,ncol(dat)]

x = dat[,-ncol(dat)]

n = nrow(x)

p = ncol(x)

table(y)/sum(table(y)) # proportion of each class of y in the data

apply(x,2,sd)

sapply(x,sd)

sapply(x,mean)

M = cor(x)#correlation

diag(M) = 0

which(abs(M)>.9)

caret::findCorrelation(cor(x))

set.seed(6041)

# itrain = sample(1:n, round(.7*n))

# do stratified sampling instead:

itrain = createDataPartition(y, p=.7, times=1)[[1]] # stratified sampling; plays the same role as the commented-out line above

dtrain = dat[itrain,]

dtest = dat[-itrain,]

# (1) Using logistic regression for prediction...

logo = glm(Y~., data=dtrain, family='binomial') # plain logistic regression fit, mainly as a point of reference

summary(logo)

logp = predict(logo, newdata=dtest, "response")

summary(logp)

logh = as.factor( levels(y)[(logp>.5)+1] ) # convert fitted probabilities into class labels with a 0.5 cutoff

# table(logh, dtest$Y)

caret::confusionMatrix(logh, dtest$Y)

roco = pROC::roc(predictor=logp, response=dtest$Y)

roco$auc

ci.auc(roco)# 95% CI of roco$auc

# boxplot(dat$X2~dat$Y)

# (2) Using PCA for prediction...

pca = prcomp(x, scale=TRUE)

plot(pca)

plot(x[,2:3], pch=20) # quick look at the relationship between two of the features

biplot(pca, col=c(1,2)) # shows both the projected observations and the variable loadings (arrows)

biplot(pca, col=c(0,2)) # arrows only, observations hidden

abline(h=0, v=0, col=8, lty=1) # draw the axes

j = which(summary(pca)$importance[3,]>.85)[1] # keep only the leading PCs explaining at least 85% of the cumulative variance

names(pca)

xt = pca$x[,1:j]

# create reduced data frame

pca.dat = data.frame(xt,Y=y)

pca.dtrain = pca.dat[itrain,]

pca.dtest = pca.dat[-itrain,]

# fit a logistic regression on those leading principal components

pca.logo = glm(Y~., data=pca.dtrain, family='binomial')

summary(pca.logo)

pca.logp = predict(pca.logo, newdata=pca.dtest, "response")

pca.logh = as.factor( levels(y)[(pca.logp>.5)+1] )

caret::confusionMatrix(logh, dtest$Y) # pre-PCA model # Accuracy: 0.8173

caret::confusionMatrix(pca.logh, dtest$Y) # post-PCA model # Accuracy: 0.7885, only slightly lower

roco = pROC::roc(predictor=logp, response=dtest$Y)#0.7856

pca.roco = pROC::roc(predictor=pca.logp, response=dtest$Y)#0.8302

roco$auc

pca.roco$auc

ci.auc(roco)

ci.auc(pca.roco)

################################ note: the lecture said to use the caret package here, without stating the goal or analysing the output

set.seed(6041)

trC = trainControl(method="boot", number=10,

                  # method="cv", number=10,repeat(重复几遍)

                  savePredictions = "all",

                  classProbs = TRUE) # bootstrap resampling; cross-validation is also possible

modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)

modo$results

pca.modo = caret::train(xt, y, family='binomial', method="glm", trControl=trC)

pca.modo$results

extract.caret.metrics <- function(co){

  # co: caret output

  ids = unique(co$pred$Resample)

  K = length(ids)

  aucs = accs = numeric(K)

  for(k in 1:K){

    sk = subset(co$pred,Resample==ids[k])

    l = co$levels[1]

    aucs[k] = roc(response=sk$obs, pred=sk[,l], quiet=TRUE)$auc

    tb = table(sk$obs, sk$pred)

    accs[k] = sum(diag(tb))/sum(tb)

  }

  return(list(aucs=aucs, accs=accs))

}

# compare CV AUCs and accuracies:

eo = extract.caret.metrics(modo)

c(mean(eo$aucs), mean(eo$accs)) # pre-PCA AUC and accuracy

pca.eo = extract.caret.metrics(pca.modo)

c(mean(pca.eo$aucs), mean(pca.eo$accs)) # post-PCA AUC and accuracy

# reporting mean and SE instead:

c(mean(eo$aucs),sd(eo$aucs)) # pre-PCA AUC (mean and SD)

c(mean(pca.eo$aucs),sd(pca.eo$aucs)) # post-PCA AUC (mean and SD)

################################ note: the lecture said to use the caret package here, without stating the goal or analysing the output

# (2) Using PCA to cluster features...

biplot(pca, col=c(0,2))

abline(h=0, v=0, col=8, lty=1)

# reproduce plot of projected features:

pca.biplot <- function(pca,cols=2,a=1,b=2){

  plot(pca$rotation[,c(a,b)],pch='')

  abline(h=0,lty=3,col=1,lwd=.5)

  abline(v=0,lty=3,col=1,lwd=.5)

  nms = row.names(pca$rotation)

  text(pca$rotation[,c(a,b)],labels=nms,col=cols)

}

pca.biplot(pca,cols=2,a=1,b=2) # label each loading point with its variable name, without drawing the arrows

# loading plot:

# each value is the loading of a variable on the i-th principal component, i.e. its contribution relative to the average level: positive means above average, negative below average

abline(h=0, v=0, col=8, lty=1)

# if two variable names sit very close together, we may not need them all, so we can cluster the loadings

C = 5 # number of cluster centres, i.e. k

C = 10

x.proj = pca$rotation[,1:2]

#ko = kmeans(pca$rotation[,1:2], centers=5)

# run k-means on the first two components only

# and take C clusters

ko = kmeans(x.proj, centers=C)

points(ko$centers, pch=20, cex=3) # mark the C cluster centres (the k-means initialisation is random)

#how they group up in the projected space, do clustering.

cbind(ko$cluster) # which cluster each feature was assigned to

table(ko$cluster) # number of features in each cluster

# pay attention to what you cluster though! here we cluster the loadings on all PCs, not just the first two

ko.all = kmeans(pca$rotation, centers=C)

points(ko.all$centers[,1:2], pch=15, cex=3, col=4)

cbind(ko$cluster, ko.all$cluster) # watch out, cluster numbers are random!

# for each cluster, identify which features it contains and pick one representative

pca.clust.set = numeric(C)

# by finding which feature is closest to the cluster centre

for(ic in 1:C){

  fs = which(ko$cluster==ic)

  centroid = ko$centers[ic,]

  dc = NULL

  for(ifs in fs){

    dc = c(dc, sqrt(mean((x.proj[ifs,]-centroid)^2)))

  }

  pca.clust.set[ic] = fs[which.min(dc)] # the feature closest to the centroid of this cluster

}

x.ss = x[,pca.clust.set]

set.seed(6041)

pca.cl.modo = caret::train(x.ss, y, family='binomial', method="glm", trControl=trC)

pca.cl.modo$results

pca.cl.eo = extract.caret.metrics(pca.cl.modo)

c(mean(eo$aucs), mean(eo$accs)) # AUC and accuracy

c(mean(pca.eo$aucs), mean(pca.eo$accs))

c(mean(pca.cl.eo$aucs), mean(pca.cl.eo$accs))

# reporting mean and SE instead:

c(mean(eo$aucs),sd(eo$aucs)) # AUC mean and SD

c(mean(pca.eo$aucs),sd(pca.eo$aucs))

c(mean(pca.cl.eo$aucs),sd(pca.cl.eo$aucs))

#### Another example... in PCA ####

library(ISLR)

dim(Hitters)

dat = na.omit(Hitters)

n = nrow(dat)

set.seed(4060)

dat = dat[sample(1:n, n, replace=FALSE),]

i.train = 1:round(.7*n)

dat.train = dat[i.train,]

dat.valid = dat[-i.train,]#test group

# (a) Scaled PCA:

# there are many predictors here; only the first 6 are used

set.seed(1)

pca = prcomp(dat.train[,1:6], scale=TRUE)

summary(pca)

# 4 PCs are necessary to capture over 86% of the information

# (b) k-means:

set.seed(1)

ko = kmeans(dat.train[,1:6], 4) # 4 clusters

table(ko$cluster)

par(font=2, font.axis=2, font.lab=2)

dims = c(1,3) # plot variables 1 and 3 (any pair would do)

plot(dat.train[,dims], pch=20,  col=c(2,4,3,5)[ko$cluster], cex=2)

# plot the clusters found above

points(ko$centers[,dims], pch='*',  col=1, cex=4)

# (the lecture did not explain what these four points mark;

# from the figure they appear to be the centroid of each cluster)

# we can see clearly that there are 4 groups

#

# From figure:

# Stars indicate the centroids of each cluster

# AtBat contributes more because of a scale effect (it has the largest SD):

sapply(dat.train[,1:6],sd)

# (b) k-means:

set.seed(1)

# k-means on the PCA scores

pca.ko = kmeans(pca$x, 4)

table(pca.ko$cluster)

par(font=2, font.axis=2, font.lab=2)

dims = c(1,3)

plot(dat.train[,dims], pch=20,  col=c(2,4,3,5)[pca.ko$cluster], cex=2)

# quite different from the previous plot (the lecture did not analyse why)

### (lecture recording: finished at 1:02)

#### Section 8 (misc): Imbalance... ####

# Imbalance...

# Imbalance arises when the vast majority of the data belong to class A and only a few to class B,

# e.g. "No" makes up 0.9667 of the observations and "Yes" only 0.0333.

# Many "Yes" cases then get classified as "No",

# and one of Sensitivity/Specificity ends up very small.

# After handling the imbalance (weighting the data so that the "Yes" class carries more weight),

# overall accuracy drops, but the detection rate of "Yes" improves markedly,

# and both Sensitivity and Specificity are high.

#

# Confusion Matrix and Statistics (with "Yes" as the positive class):

#                Reference

# Prediction    No    Yes

#        No     TN    FN

#        Yes    FP    TP

# T/F = True/False, P/N = Positive/Negative

# FP: false alarm (predicted Yes, actually No)

# FN: missed detection (predicted No, actually Yes)

# Sensitivity = TP/(TP+FN)

# Specificity = TN/(TN+FP)
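
# A small sketch computing these rates directly from a 2x2 table laid out as above
# (tb holds hypothetical counts, rows = prediction, columns = reference):
tb = matrix(c(2890, 14, 10, 86), nrow=2, byrow=TRUE,
            dimnames=list(Prediction=c("No","Yes"), Reference=c("No","Yes")))
TN = tb["No","No"];  FN = tb["No","Yes"]
FP = tb["Yes","No"]; TP = tb["Yes","Yes"]
c(Accuracy    = (TP+TN)/sum(tb),
  Sensitivity = TP/(TP+FN),
  Specificity = TN/(TN+FP))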

rm(list=ls())

library(ISLR)

head(Default)

tb = table(Default$default)

tb/sum(tb)

dat = Default

set.seed(4061)

itrain = createDataPartition(dat$default, p=.7, times=1)[[1]]

dtrain = dat[itrain,]

dtest = dat[-itrain,]

trC = trainControl(method="boot", number=50,

                  # method="cv", number=10,

                  savePredictions = "all",

                  classProbs = TRUE)

# (1) Using logistic regression for prediction...

set.seed(6041)

x = dtrain

x$default = NULL

y = dtrain$default

modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)

modo$results

preds = predict(modo, dtest)

confusionMatrix(preds, dtest$default) # many "Yes" cases are classified as "No"

# one of Sensitivity/Specificity will be very small

confusionMatrix(preds, dtest$default, positive=levels(dtest$default)[2])

# positive=levels(dtest$default)[2] sets the positive class to "Yes" (by default it is the first level, "No")

# Accuracy and the confusion matrix are unchanged; the Sensitivity and Specificity values swap

levels(dtest$default)

# (2) Using weighted logistic regression for prediction...

tbt = table(dtrain$default)

ws = rev( as.numeric(tbt/sum(tbt)) )

w = numeric(nrow(dtrain))

l = levels(dtrain$default)

w[which(dtrain$default==l[1])] = ws[1]

w[which(dtrain$default==l[2])] = ws[2]

#modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)

modo.w = caret::train(x, y, family='binomial', method="glm", trControl=trC, weights=w)

modo.w$results

preds.w = predict(modo.w, dtest)

confusionMatrix(preds.w, dtest$default, positive=levels(dtest$default)[2])

# overall accuracy drops, but the detection rate of "Yes" improves markedly

# both Sensitivity and Specificity are now high
