# Mahalanobis Distance & Logistic Regression
How to find multivariate outliers (anomalous observations): compare each point's squared Mahalanobis distance against a chi-square quantile (the 99% quantile is used as the cutoff below). In R, the Mahalanobis distance needs only mahalanobis(t, m, s): the data matrix t, the column means m, and the covariance matrix s.
```
d=read.csv("hmeq.csv",na.strings="")
dim(d)
View(d)
dc =d[complete.cases(d),] #remove missing values
dim(dc)
mdist = function(x) # Mahalanobis distance helper
{
  t = as.matrix(x)    # convert x to a matrix
  p = dim(t)[2]       # number of columns
  m = apply(t,2,mean) # column means
  s = var(t)          # var() on a matrix returns the covariance matrix
  return(mahalanobis(t,m,s))
}
# split the data by class
dc1 = dc[dc$BAD==1,] # the first column BAD is a 0/1 indicator
dc0 = dc[dc$BAD==0,]
# Mahalanobis distance within each group
mdc1 = mdist(dc1[,-c(1,5,6)]) # drop columns 1, 5, 6: the response and the non-numeric columns
mdc0 = mdist(dc0[,-c(1,5,6)])
c = qchisq(0.99,10) # chi-square cutoff: 99% of well-behaved points should fall below this value
# df = 10: 13 variables minus the 3 dropped columns
mdc=mdist(dc[,-c(1,5,6)])
# keep only the points whose Mahalanobis distance is below the cutoff
x1=dc1[mdc1<c,]
x0=dc0[mdc0<c,]
# logistic regression
lg.fit = glm(BAD~.,data=dc,family=binomial) # fit on the complete cases so the fitted values line up with dc$BAD below
# BAD~. takes BAD as the response and every remaining column as a predictor;
# family=binomial gives logistic regression (an unspecified family would default to linear regression)
summary(lg.fit$fitted.values)
# classify with a 0.3 cutoff and assess with ROC
pred1 = predict(lg.fit,type="response")
pred1[pred1>0.3] <- 1 # map fitted probabilities to classes with 0.3 as the cutoff
pred1[pred1<=0.3] <- 0
table(pred1,dc$BAD) # confusion matrix
library(pROC)
roc(dc$BAD,pred1,plot=TRUE,print.auc=TRUE,legacy.axes=TRUE)
# arguments: response, then predictor; plot the curve, print the AUC, and legacy.axes=TRUE makes the x-axis run from 0 to 1
```
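A quick add-on sketch (not from the original notes): pull accuracy, sensitivity and specificity out of the confusion matrix above. It reuses pred1 and dc from the block; the name tab is hypothetical.
```
tab = table(pred1, dc$BAD)    # rows = predicted class, columns = true class
sum(diag(tab))/sum(tab)       # accuracy: share of correct predictions
tab["1","1"]/sum(tab[,"1"])   # sensitivity: correctly flagged BAD=1 cases
tab["0","0"]/sum(tab[,"0"])   # specificity: correctly passed BAD=0 cases
```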
## Tree
```
heart <- read.csv("heart.csv",header=T,na.strings="?")
summary(heart) # no missing values
names(heart)
heart$target[heart$target==1] <- "Yes"
heart$target[heart$target==0] <- "No"
set.seed(1234) # fix the seed so the split is reproducible
train <- sample(nrow(heart),0.7*nrow(heart)) # 70/30 train/test split
theart <- heart[train,]  # training set
vheart <- heart[-train,] # test set
library(rpart)
dtree<-rpart(target~.,data=theart,method="class",parms=list(split="information"))
printcp(dtree)
print(dtree)
tree <- prune(dtree,cp=dtree$cptable[which.min(dtree$cptable[,"xerror"]),"CP"]) # prune back to the cp with the smallest cross-validated error (xerror)
opar <- par(no.readonly=T) # save the current graphics parameters so they can be restored later
par(mfrow=c(1,2))
#install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(dtree,branch=1,type=2,fallen.leaves=T,cex=0.6,sub="Before")
rpart.plot(tree,branch=1,type=4,fallen.leaves=T,cex=0.6,sub="After")
par(opar)
predtree <- predict(tree,newdata=vheart,type="class") # predict on the test set
table(vheart$target,predtree,dnn=c("True","Predict")) # confusion matrix
```
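prune() above keeps the subtree at the cp with the smallest cross-validated error. A common alternative worth noting (not in the original notes) is the 1-SE rule: pick the simplest tree whose xerror is within one standard error of the minimum. A sketch assuming dtree from the block above:
```
cptab = dtree$cptable
best = which.min(cptab[,"xerror"])
thresh = cptab[best,"xerror"] + cptab[best,"xstd"] # minimum xerror plus one standard error
row1se = min(which(cptab[,"xerror"] <= thresh))    # first (simplest) tree under the threshold
tree1se = prune(dtree, cp = cptab[row1se,"CP"])
rpart.plot(tree1se, sub="1-SE rule")
```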
## Tree2
```
d=read.csv("train.csv",header=TRUE)
dc=d[complete.cases(d),]
d0=d[d$y==0,]
d1=d[d$y==1,]
d2=d[d$y==2,]
d3=d[d$y==3,]
# draw a random label 1-10 for every row of each class; rows labelled 1-5 go to the
# training set and 6-10 to the test set, giving a roughly 50/50 split within each class
label0=sample(c(1:10),dim(d0)[1],replace=TRUE)
label1=sample(c(1:10),dim(d1)[1],replace=TRUE)
label2=sample(c(1:10),dim(d2)[1],replace=TRUE)
label3=sample(c(1:10),dim(d3)[1],replace=TRUE)
d0_train=d0[label0<=5,]
d0_test=d0[label0>5,]
d1_train=d1[label1<=5,]
d1_test=d1[label1>5,]
d2_train=d2[label2<=5,]
d2_test=d2[label2>5,]
d3_train=d3[label3<=5,]
d3_test=d3[label3>5,]
d_train=rbind(d0_train,d1_train,d2_train,d3_train)
d_test=rbind(d0_test,d1_test,d2_test,d3_test)
library(nnet)
re_log=multinom(y~.-id,data=d_train)
# y~.-id uses every variable except id as a predictor
# works like glm, but multinom fits a logistic regression for a response with more than two classes
pred_log=predict(re_log,newdata=d_test)
# run the test data through the fitted model; note that the new data goes in the newdata argument
tab_log=table(d_test$y,pred_log)
# a confusion-matrix-style table to inspect the model; d_test$y is the response of the test set only
library(rpart)
re_id3=rpart(y~.-id,data=d_train,method="class")
# choose method="class" for classification
re_id3_mistake=rpart(y~.-id,data=d_train)
# without method="class" rpart infers the method from the response: y is numeric, so this mistakenly grows a regression tree
library(RWeka) # loaded in the original notes but not used below
re_id3=rpart(y~.-id,data=d_train,method="class",parms=list(split="information")) # information gain (ID3-style splits)
re_CART=rpart(y~.-id,data=d_train,method="class",parms=list(split="gini"))       # Gini index (CART splits)
pred_id3=predict(re_id3,newdata=d_test)
pred_CART=predict(re_CART,newdata=d_test,type="class")
re_CART=rpart(y~.-id,data=d_train,method="class",parms=list(split="gini"),control=rpart.control(cp=0.001))
# cp defaults to 0.01; a smaller cp allows more splits, i.e. a larger tree
table(d_test$y,pred_CART)
re_CART=rpart(y~.-id,data=d_train,method="class",parms=list(split="gini"),control=rpart.control(cp=0.0001))
# pruning the tree
min=which.min(re_CART$cptable[,4]) # row of the cptable with the smallest cross-validated error (column 4, xerror)
min
re_CART_f=prune(re_CART,cp=re_CART$cptable[min,1]) # prune back to the cp of that row (column 1, CP)
pred_CART=predict(re_CART_f,newdata=d_test,type="class") # re-predict with the pruned tree before tabulating
table(d_test$y,pred_CART)
plot(re_CART_f)
# random forest
library(randomForest) # needed here: this block does not load it elsewhere
d_train$y=as.factor(d_train$y) # randomForest needs a factor response for classification
re_rf=randomForest(y~.-id,data=d_train,ntree=5)
pred_rf=predict(re_rf,newdata=d_test,type="prob") # class probabilities per test row
# collapse the four classes into a binary 0/1 problem; convert y back to numeric first,
# since comparisons like >= are not defined for factors
d_train$y=as.numeric(as.character(d_train$y))
d_train$y[d_train$y>=1]=1
d_test$y[d_test$y>=1]=1
```
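The block above stops at class probabilities. A minimal sketch (assuming re_rf and d_test as built above, and run before the final lines collapse y to 0/1) of turning them into hard labels and a confusion matrix:
```
pred_rf_class = predict(re_rf, newdata=d_test) # the default type returns class labels
table(d_test$y, pred_rf_class)                 # confusion matrix over the four classes
mean(as.character(pred_rf_class) == as.character(d_test$y)) # overall accuracy
```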
## Decision tree (random forest)
```
library(randomForest)
library(rpart)
library(rpart.plot)
heart <- read.csv("heart.csv",header=T,na.strings="?")
summary(heart)
data.index = sample(c(1,2), nrow(heart), replace = T, prob = c(0.7, 0.3))
train_data = heart[which(data.index == 1),] # training set
test_data = heart[which(data.index == 2),]  # test set
n <- length(names(train_data))
rate = c() # vector of mean OOB error per (mtry, ntree) combination; matrix() cannot be grown by index
train_data$target <- as.factor(train_data$target) # convert once; writing as.factor(train_data$target)~. in the formula would leave the raw target among the "." predictors
for (i in 1:(n-1)) # candidate mtry values
{
  for (j in 1:100) # candidate ntree values
  {
    set.seed(1234)
    rf_train = randomForest(target~.,data=train_data,mtry=i,ntree=j)
    rate[(i-1)*100+j] = mean(rf_train$err.rate) # average error of this combination
  }
}
z = which.min(rate) # position of the smallest error over all combinations
print(z)
set.seed(1234)
rf_train <- randomForest(target~.,data=train_data,mtry=1,ntree=500,importance=TRUE,proximity=TRUE)
importance <- importance(rf_train) # variable importance matrix
barplot(rf_train$importance[,1],main="importance")
box()
importance(rf_train,type=2) # type=2: mean decrease in node impurity (Gini)
varImpPlot(x=rf_train,sort=TRUE,n.var=nrow(rf_train$importance)) # show the importance of every variable
print(rf_train)
hist(treesize(rf_train)) # distribution of the number of nodes per tree
max(treesize(rf_train))
MDSplot(rf_train,train_data$target,palette=rep(1,2),pch=as.numeric(train_data$target)) # MDS plot of the proximity matrix
pred <- predict(rf_train,newdata=test_data)
pred_out_1 <- predict(object=rf_train,newdata=test_data,type="prob") # class probabilities
table <- table(pred,test_data$target) # confusion matrix
sum(diag(table))/sum(table) # overall accuracy
plot(margin(rf_train,test_data$target)) # margin of correct votes per observation
```
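The double loop above refits the forest 1,300 times, which is slow. randomForest also ships tuneRF(), which searches mtry by OOB error directly; a sketch under the same train_data (target already a factor, randomForest already loaded):
```
set.seed(1234)
x = train_data[, names(train_data) != "target"] # predictors only
y = train_data$target
tuneRF(x, y, ntreeTry = 500, stepFactor = 2, improve = 0.01) # reports OOB error for each mtry tried
```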
## Bootstrap
Bootstrap:
1. The population size is unknown.
2. Resample with replacement from the observed sample i times; the estimates (mean etc.) across the resamples stand in for the population values.
3. Each resample has as many observations as the original sample.
4. Sampling this way is called bootstrap sampling.
Monte Carlo:
1. The population (its distribution) is known.
2. Draw one sample (or several) from it and use the sample estimate as the population estimate.
```
n=100
# Monte Carlo simulation: draw a fresh sample from the known distribution on every iteration
alpha=c()
library(MASS)
for (i in 1:100){
  mu1=c(0,0)
  sigma1=matrix(c(1,0.5,0.5,1.25),nrow=2)
  rand1=mvrnorm(n=100,mu=mu1,Sigma=sigma1) # n draws, mean vector mu, covariance matrix Sigma
  X=rand1[,1]
  Y=rand1[,2]
  alpha[i]=(var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y))
}
rand1
# bootstrap: resample rows of the last simulated sample with replacement
for (j in 1:100){
  ran=rand1[sample(c(1:100),100,replace=TRUE),]
  # sample(c(1:100),100,replace=TRUE) draws 100 row indices with replacement; the loop repeats this 100 times
  X=ran[,1]
  Y=ran[,2]
  alpha[j]=(var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y))
}
# rand1 stores the latest draws from the multivariate normal (new values following that distribution)
# ran resamples those 100 rows with replacement, forming a new bootstrap sample
```
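The boot package (used again in the cross-validation section) wraps this resampling and reports the bootstrap standard error directly. A sketch assuming rand1 from above, following the usual alpha example:
```
library(boot)
alpha.fn = function(data, index) { # statistic recomputed on every bootstrap resample
  X = data[index,1]
  Y = data[index,2]
  (var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y))
}
boot(data = rand1, statistic = alpha.fn, R = 1000) # prints the estimate and its bootstrap SE
```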
## Cross-validation
```
set.seed(1)
y=rnorm(100)
x=rnorm(100)
y=x-2*x^2+rnorm(100)
#(b)
plot(x,y)
#(c) compute the LOOCV error
error=c()
d=cbind(x,y) # cbind binds columns together (rbind would bind rows)
d=as.data.frame(d)
for (i in 1:100)
{
m1=glm(y~x,data=d[-i,]) # fit the model without row i (the LOOCV idea)
pred_m1=predict(m1,newdata=d[i,]) # prediction of that model at the held-out row i
error[i]=d[i,2]-pred_m1 # difference between the true value and the prediction
}
error # i runs over all 100 rows, so there are 100 errors
sum(error^2) # divide by 100 for the mean squared error that cv.glm reports below
library(boot) # cv.glm lives here
m1=glm(y~x,data=d)
m1r=cv.glm(data=d,glmfit=m1,K=100) # K = n = 100 folds, i.e. LOOCV
m1r$delta
m2=glm(y~poly(x,2),data=d)
m2r=cv.glm(data=d,glmfit=m2,K=100)
m2r$delta
m3=glm(y~poly(x,3),data=d)
m3r=cv.glm(data=d,glmfit=m3,K=100)
m3r$delta
m4=glm(y~poly(x,4),data=d)
m4r=cv.glm(data=d,glmfit=m4,K=100)
m4r$delta
```
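The four fits above can also be collected in one loop. A sketch (loop not in the original notes), here with K = 10, i.e. 10-fold CV, as a cheaper variant:
```
library(boot)
set.seed(1)
cv.err = rep(NA,4)
for (p in 1:4){
  m = glm(y~poly(x,p),data=d)
  cv.err[p] = cv.glm(data=d,glmfit=m,K=10)$delta[1] # first delta = raw CV estimate
}
cv.err # the quadratic model should come out lowest, matching the true model
```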
## data cleaning
```
# data cleaning
# 1. missing values
data=data[complete.cases(data),]  # drop rows that contain missing values
data=data[!complete.cases(data),] # or keep only the incomplete rows, to inspect them
# 2. drop duplicated rows
data=unique(data)
# 3. locate missing values
c=is.na(data) # logical matrix, TRUE where a value is missing
# 4. mark missing values while reading the file
data=read.csv("data.csv",na.strings="")
```
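A small add-on: is.na() above returns a full logical matrix; column and row summaries are usually more useful. A sketch, with data standing for whatever frame is being cleaned:
```
colSums(is.na(data))         # number of missing values per column
which(!complete.cases(data)) # indices of rows containing at least one NA
mean(!complete.cases(data))  # share of incomplete rows
```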
## Clustering (k-means) & PCA
1. PCA
2. k-means
```
X <- rbind(matrix(rnorm(20*50, mean = 0), nrow = 20),
           matrix(rnorm(20*50, mean = 0.7), nrow = 20),
           matrix(rnorm(20*50, mean = 1.4), nrow = 20)) # rbind stacks rows together, cbind binds columns
X.pca = prcomp(X)
plot(X.pca$x[,1:2], col=c(rep(1,20), rep(2,20), rep(3,20))) # scores on the first two PCs; col colours the true groups (1, 2, 3 repeated 20 times each)
res = kmeans(X, centers = 3) # centers = 3 asks for 3 clusters
true_class = c(rep(1,20), rep(2,20), rep(3,20))
table(res$cluster, true_class) # read the rows: if each cluster concentrates in one true class, the data are perfectly clustered
```
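centers = 3 is known here because the data were simulated from three means; on real data a common heuristic is the elbow plot of the total within-cluster sum of squares. A sketch reusing X from above:
```
wss = sapply(1:8, function(k) kmeans(X, centers = k, nstart = 20)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "k", ylab = "total within-cluster SS") # look for the elbow
```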