本文摘自:R 语言主成分分析(PCA)实战教程
方便个人学习和查阅
安装依赖:
install.packages("FactoMineR")
install.packages("factoextra")
library("FactoMineR")
library("factoextra")
数据准备:
# 来自factoextra包的decathlon2演示数据集,数据集如下:
data(decathlon2)
head(decathlon2)
# pca前,先进行标准化:标准偏差1,平均值为零
# FactoMineR 中,PCA之前会自动标准化数据
decathlon2.active <- decathlon2[1:23, 1:10]
decathlon2.active[, 1:6]
res.pca <- PCA(decathlon2.active, graph = FALSE)
PCA(decathlon2.active) # 显示图
一、变量分析
var <- get_pca_var(res.pca)
1. 相关曲线作图
var$coord
fviz_pca_var(res.pca, col.var = "black")
2. 代表质量作图
var$cos2
corrplot展示各变量对各主成分的代表质量
library("corrplot")
# is.corr表示输入的矩阵不是相关系数矩阵
corrplot(var$cos2, is.corr=FALSE)
各变量对一二主成分的代表质量柱形图(通过值的叠加显示)
fviz_cos2(res.pca, choice = "var", axes = 1:2)
各变量相关图,颜色代表代表质量
fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
3. 变量对主成分的贡献作图
var$contrib
corrplot展示每个变量对每个主成分的贡献
library("corrplot")
corrplot(var$contrib, is.corr=FALSE)
各变量对第一主成分的贡献
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)
各变量对第二主成分的贡献
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)
各变量对第一二主成分的总贡献
fviz_contrib(res.pca, choice = "var", axes = 1:2, top = 10)
各变量相关图,颜色展示贡献度
fviz_pca_var(res.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
二、观测值分析
factoextra包中的get_pca_ind()提取个体坐标,相关性,cos2 和贡献率
ind <- get_pca_ind(res.pca)
ind
1. 观测值坐标图
fviz_pca_ind(res.pca)
2. 观测值坐标图,cos2着色
ind$cos2
fviz_pca_ind(res.pca, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
3. 观测值坐标图,cos2着色,cos2大小
fviz_pca_ind(res.pca, col.ind = "cos2", pointsize = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
4. 观测值柱形图,cos2代表质量
fviz_cos2(res.pca, choice = "ind")
5. 观测值柱形图,contrib贡献
fviz_contrib(res.pca, choice = "ind", axes = 1:2)
三、自定义观测值作图
1. 数据准备
head(iris)
iris.pca <- PCA(iris[,-5], graph = FALSE)
PCA(iris[,-5])
2. PCA展示,添加椭圆,自定义颜色
fviz_pca_ind(iris.pca,
# 只显示点而不显示文本,默认都显示
geom.ind = "point",
# 设定分类种类
col.ind = iris$Species,
# 设定颜色
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
# 添加椭圆 Concentration ellipses
addEllipses = TRUE,
legend.title = "Groups",
)
3. PCA展示,添加椭圆,分组颜色
fviz_pca_ind(iris.pca,
label = "none", # hide individual labels
habillage = iris$Species, # color by groups
addEllipses = TRUE, # Concentration ellipses
palette = "jco" # jco(临床肿瘤学杂志)调色板
)
4. PCA展示,添加多边形,分组颜色
fviz_pca_ind(iris.pca, geom.ind = "point",
col.ind = iris$Species, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
# 用凸包多边形代替椭圆
addEllipses = TRUE, ellipse.type = "convex",
legend.title = "Groups"
)
四、观测量和变量的biplot(双标图)
biplot 展示了两方面内容:根据前两个主成分,每个观测的得分;根据前两个主成分,每个变量的载荷。
1. PCA biplot
fviz_pca_biplot(res.pca, repel = TRUE,
col.var = "#2E9FDF", # Variables color
col.ind = "#696969" # Individuals color
)
2. PCA biplot,添加椭圆
fviz_pca_biplot(iris.pca, repel = TRUE,
# 观测量颜色
col.ind = iris$Species, palette = "jco",
# 添加椭圆
addEllipses = TRUE, label = "var",
# 线条颜色
col.var = "black",
legend.title = "Species")
3. PCA biplot,添加椭圆,点大小
fviz_pca_biplot(iris.pca,
# Fill individuals by groups
geom.ind = "point",
# 点的形状
pointshape = 21,
# 点的大小
pointsize = 2.5,
# 按照组类特定形状
fill.ind = iris$Species,
col.ind = "black",
# Color variable by groups
# 颜色
col.var = factor(c("sepal", "sepal", "petal", "petal")),
# 标题
legend.title = list(fill = "Species", color = "Clusters"),
repel = TRUE # Avoid label overplotting
)+
ggpubr::fill_palette("jco")+ # Indiviual fill color
ggpubr::color_palette("npg") # Variable colors