1. 机器学习的训练集和验证集拆分
需要一个R包:caret
代码:
library(caret)

# Fix the RNG seed so the same partition is drawn on every run.
set.seed(12)

# Split the original data frame `data` 75% / 25%, partitioning on its
# `Name` column. The partition object is a list here; its `Resample1`
# element holds the row indices selected for training.
data_index <- createDataPartition(data$Name, p = 0.75)
train_rows <- data_index$Resample1

# Selected rows form the training set; every remaining row is the test set.
data_train <- data[train_rows, ]
data_test <- data[-train_rows, ]
2. PCA降维分析
library(psych)
1) 通过平行分析(fa.parallel)确定最佳主成分个数
# Run fa.parallel() on `data`, restricted to principal components
# (fa = "pc"). The result is kept in `parpca`; its `pc.values` element
# is used later for plotting.
parpca <- fa.parallel(
  x = data,
  fa = "pc"
)
2) 假设计算出来最佳主成分个数为40,提取前40个主成分用于后续分析
# Fit a 40-component principal-components solution on `ETHims`.
# NOTE(review): principal() applies a rotation by default according to the
# psych documentation; pass rotate = "none" if unrotated components are
# wanted -- confirm which is intended.
ETHpca2 <- principal(ETHims, nfactors = 40)

# Compute component scores for the same data. Go through the predict()
# generic rather than calling the S3 method predict.psych() directly, so
# normal method dispatch is used.
ETHpca40 <- predict(ETHpca2, ETHims)

# Print the dimensions of the resulting score matrix.
dim(ETHpca40)
3) 用ggplot2画图展示前50个主成分的特征值(碎石图)
# ggplot(), aes(), theme_bw(), geom_point(), geom_line() and labs() all
# come from ggplot2, so it must be attached before plotting.
library(ggplot2)

# Number of leading components to display.
pcanum <- 50

# Cap at the number of eigenvalues actually available so the plotting data
# never contains NA padding when fewer than `pcanum` components exist.
n_shown <- min(pcanum, length(parpca$pc.values))

# One row per component: its index and the value from `parpca$pc.values`.
plotdata <- data.frame(
  x = seq_len(n_shown),
  pc.values = parpca$pc.values[seq_len(n_shown)]
)

# Points joined by a line, one point per component.
# (The assignment above and this call were fused on a single line in the
# original, which does not parse.)
ggplot(plotdata, aes(x = x, y = pc.values)) +
  theme_bw() +
  geom_point(colour = "red") +
  geom_line(colour = "blue") +
  labs(x = "Component Number")
3. KNN分析
1) 简介
使用caret包中的knn3函数来进行KNN分析。KNN算法原理见:https://blog.csdn.net/weixin_45014385/article/details/123618841
# Fit a k-nearest-neighbour model with k = 5, using `train_ETH` as the
# predictors and `train_lab` as the outcome.
ETHknn <- knn3(
  x = train_ETH,
  y = train_lab,
  k = 5
)
通过5折交叉验证寻找最佳近邻数k:
# Seed the RNG so the cross-validation resampling is reproducible.
set.seed(123)

# Resampling scheme: 5-fold cross-validation.
trcl <- trainControl(method = "cv", number = 5)

# Candidate neighbour counts: the odd values 1, 3, ..., 25.
trgrid <- expand.grid(k = seq(from = 1, to = 25, by = 2))

# Evaluate every candidate k with the resampling scheme above.
ETHknnFit <- train(
  x = train_ETH,
  y = train_lab,
  method = "knn",
  trControl = trcl,
  tuneGrid = trgrid
)
2) 代码示例:
library(MASS)

# Load the `biopsy` data set and print its structure.
data(biopsy)
str(biopsy)

# Drop the first column and give the remaining ten columns short names;
# the last one, `class`, is the outcome.
biopsy <- biopsy[, -1]
names(biopsy) <- c(
  "thick", "u.size", "u.shape", "adhsn", "s.size",
  "nucl", "chrom", "n.nuc", "mit", "class"
)

# Keep complete rows only and fix the outcome's level order explicitly.
df <- na.omit(biopsy)
df$class <- factor(df$class, levels = c("benign", "malignant"))

# Class balance, in percent, rounded to one decimal place.
round(prop.table(table(df$class)) * 100, digits = 1)

# Random split with roughly 70% / 30% probability: each row is labelled
# 1 (train) or 2 (test). `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
set.seed(123)
ind <- sample(1:2, nrow(df), replace = TRUE, prob = c(0.7, 0.3))
train <- df[ind == 1, ]
test <- df[ind == 2, ]
# Install `class` and `gmodels` only when they are missing, instead of
# re-installing them every time the script runs.
for (pkg in c("class", "gmodels")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach the packages in the original order so function masking between
# them is unchanged.
library(class)
library(gmodels)
library(psych)
library(caret)
library(Metrics)
# Columns 1-9 are used as predictors and column 10 as the outcome.
predictor_cols <- 1:9
outcome_col <- 10

# Predictors for both subsets.
train_data <- train[, predictor_cols]
test_data <- test[, predictor_cols]

# Outcome labels for both subsets.
train_class <- train[, outcome_col]
test_class <- test[, outcome_col]

# Fit a k-nearest-neighbour model with k = 7 on the training subset.
knn_result <- knn3(
  x = train_data,
  y = train_class,
  k = 7
)

# Predict class labels for the test subset, then report accuracy of the
# predictions against the true labels.
test_pre <- predict(knn_result, test_data, type = "class")
accuracy(test_class, test_pre)
# Search for a suitable k.
# Seed the RNG so the cross-validation resampling is reproducible.
set.seed(123)

# Resampling scheme: 5-fold cross-validation.
trcl <- trainControl(method = "cv", number = 5)

# Candidate neighbour counts: the odd values 1, 3, ..., 49.
trgrid <- expand.grid(k = seq(from = 1, to = 50, by = 2))

# Evaluate every candidate k on the training subset.
knnFit <- train(
  x = train_data,
  y = train_class,
  method = "knn",
  trControl = trcl,
  tuneGrid = trgrid
)

# Plot the tuning results.
plot(knnFit, main = "KNN")
4. 神经网络
1) 使用的R包:neuralnet
library(neuralnet)