1. 引言
(O)PLS-DA,全称为Orthogonal Partial Least Squares Discriminant Analysis,也就是正交偏最小二乘判别分析,是一种常用的多元线性回归分析方法。它被广泛应用于数据分析、模式识别和机器学习领域,特别是在生物信息学中,用于处理高维度、复杂的生物数据。
(O)PLS-DA的主要目标是找出数据中最能表现出类别差异的方向,使得同一类别的样本在新的坐标系中尽可能近,不同类别的样本尽可能远。它通过PCA(主成分分析)和PLS(偏最小二乘)的结合,建立一个既能解释X(预测变量)的方差又能最大限度地解释Y(应变量)的模型。
2. (O)PLS-DA分析
2.1 加载R包和导入数据
## 加载包
library(ropls)
## 读取数据
expr <- read.table("sample.csv", header = TRUE,sep = ",",row.names = 1)
group_info <- data.table::fread("group.csv", header = TRUE)
expr数据格式
image.png
group_info数据格式
image.png
PLS(DA)分析
2.2 因变量为离散型数据(如性别)时的PLS-DA图
基础得分图
sacurine.plsda <- opls(t(expr), group_info$gender, orthoI = 0)
plot(sacurine.plsda, typeVc = "x-score")
image.png
去除样本名并添加相应的散点
## 设置颜色,颜色是从ropls包源代码中提取出来的
color <- c("blue", "red", "green3", "cyan", "magenta", "#FF7F00", "#6A3D9A", "#B15928", "aquamarine4", "yellow4", "#A6CEE3", "#B2DF8A", "#FB9A99", "#FDBF6F", "#FFFF99")
## 提取画图数据
a <- data.frame(sacurine.plsda@scoreMN)
b <- sacurine.plsda@suppLs$y
levels_b <- sort(levels(factor(b)))
level_to_color <- setNames(color, levels_b)
color_vector <- level_to_color[b]
## 画图
rownames(sacurine.plsda@suppLs$yMCN) <- NULL
plot (sacurine.plsda, type = 'x-score',parPaletteVc = color)
## 可以选择pch来更换散点形状
points(a$p1, a$p2,col = color_vector, pch=16, cex=1)
image.png
2.3 因变量为连续型数据(如age、bmi)时的PLS图
基础图
sacurine.plsda <- opls(t(expr), group_info$bmi, orthoI = 0)
plot(sacurine.plsda, typeVc = "x-score")
image.png
去除样本名并添加相应的散点
## 设置颜色,颜色是从ropls包源代码中提取出来的
scaVc <- rev(rainbow(100, end = 4/6))
## 提取画图数据
a <- data.frame(sacurine.plsda@scoreMN)
b <- sacurine.plsda@suppLs$y
d <- cbind(a,b)
d <- d[order(d$b),]
color <- scaVc[round((d$b - min(d$b, na.rm = TRUE)) / diff(range(d$b, na.rm = TRUE)) * 99) + 1]
## 画图
level_to_color <- setNames(color, d$b)
rownames(sacurine.plsda@suppLs$yMCN) <- NULL
plot (sacurine.plsda, type = 'x-score')
points(d$p1, d$p2,col = level_to_color, pch=16, cex=1)
image.png
OPLS(DA)分析
2.4 因变量为离散型数据(如性别)时的OPLS-DA图
基础得分图
sacurine.oplsda <- opls(t(expr), group_info$gender, predI = 1, orthoI = NA, fig.pdfC = "none")
plot(sacurine.oplsda, typeVc = "x-score")
image.png
去除样本名并添加相应的散点
## 设置颜色,颜色是从ropls包源代码中提取出来的
color <- c("blue", "red", "green3", "cyan", "magenta", "#FF7F00", "#6A3D9A", "#B15928", "aquamarine4", "yellow4", "#A6CEE3", "#B2DF8A", "#FB9A99", "#FDBF6F", "#FFFF99")
## 提取画图数据
a1 <- data.frame(sacurine.oplsda@scoreMN)
a2 <- data.frame(sacurine.oplsda@orthoScoreMN)
a <- cbind(a1,a2)
b <- sacurine.oplsda@suppLs$y
levels_b <- sort(levels(factor(b)))
level_to_color <- setNames(color, levels_b)
color_vector <- level_to_color[b]
## 画图
rownames(sacurine.oplsda@suppLs$yMCN) <- NULL
plot (sacurine.oplsda, type = 'x-score',parPaletteVc = color)
points(a$p1, a$o1,col = color_vector, pch=16, cex=1)
image.png
2.5 因变量为连续型数据(如age、bmi)时的OPLS图
基础图
sacurine.oplsda <- opls(t(expr), group_info$bmi, predI = 1, orthoI = NA, fig.pdfC = "none")
plot(sacurine.oplsda, typeVc = "x-score")
image.png
去除样本名并添加相应的散点
## 设置颜色,颜色是从ropls包源代码中提取出来的
scaVc <- rev(rainbow(100, end = 4/6))
color <- scaVc[round((d$b - min(d$b, na.rm = TRUE)) / diff(range(d$b, na.rm = TRUE)) * 99) + 1]
## 提取画图数据
a1 <- data.frame(sacurine.oplsda@scoreMN)
a2 <- data.frame(sacurine.oplsda@orthoScoreMN)
a <- cbind(a1,a2)
b <- sacurine.oplsda@suppLs$y
d <- cbind(a,b)
d <- d[order(d$b),]
level_to_color <- setNames(color, d$b)
## 画图
rownames(sacurine.oplsda@suppLs$yMCN) <- NULL
plot (sacurine.oplsda, type = 'x-score')
points(d$p1, d$o1,col = level_to_color, pch=16, cex=1)
image.png