恶性实体瘤组织不仅包括肿瘤细胞,还包括与肿瘤相关的正常的基质细胞,免疫细胞和血管细胞等。免疫细胞的种类很多,不同类型的免疫细胞在抗肿瘤和肿瘤免疫逃逸过程中发挥着不同的作用,与肿瘤的生长、侵袭和转移也紧密相关;基质细胞被认为在肿瘤生长、疾病进展和耐药性中起重要作用。
ESTIMATE (Estimation of STromal and Immune cells in MAlignant Tumour tissues using Expression data) 基于单样本基因集富集分析(ssGSEA),利用癌症样本转录谱计算基质和免疫评分以预测浸润基质和免疫细胞的水平。最后联合这两个得分生成ESTIMATE score (Estimate socre = Stromal score + Immune score),用于分析肿瘤纯度。
CIBERSORT和ESTIMATE的不同:
1)ESTIMATE除了免疫细胞,还能分析肿瘤细胞纯度和基质细胞的丰度;
2)ESTIMATE关于免疫细胞,仅能计算一个总的免疫细胞评分,而无法给出每种免疫细胞的具体比例。
1. 表达矩阵预处理
# 安装并加载所需的R包
# install.packages("estimate", repos="http://R-Forge.R-project.org")
library(estimate)
# 读取表达矩阵
COAD_Variant_ALL_2[1:6,1:3]
## TCGA-A6-6780-01A-11R-A278-07 TCGA-CM-5868-01A-01R-1653-07 TCGA-CM-6165-01A-11R-1653-07
## 5S_rRNA 0 0 0
## 7SK 63 0 0
## A1BG 0 1 5
## A1BG-AS1 5 6 11
## A1CF 3615 1111 1647
## A2M 14058 16785 17170
# 取cpm
dat = log2(edgeR::cpm(COAD_Variant_ALL_2)+1)
dat[1:6, 1:3]
## TCGA-A6-6780-01A-11R-A278-07 TCGA-CM-5868-01A-01R-1653-07 TCGA-CM-6165-01A-11R-1653-07
## 5S_rRNA 0.0000000 0.00000000 0.0000000
## 7SK 1.0854934 0.00000000 0.0000000
## A1BG 0.0000000 0.02556706 0.1764518
## A1BG-AS1 0.1230777 0.14701771 0.3631400
## A1CF 6.0309370 4.38296757 5.4546789
## A2M 7.9737768 8.23415019 8.8066045
write.table(dat, file = "COAD_estimate_input.txt", sep = '\t', quote = F)
exp.file = "COAD_estimate_input.txt"
in.gct.file = "ESTIMATE_input.gct"
# 将输入文件格式化为 GCT 格式
outputGCT(exp.file, in.gct.file)
rt <- read.table("ESTIMATE_input.gct",
skip = 2, # 去掉前2行后,可以看到剩下的数据为一个新的数据集矩阵
header = TRUE,
sep = "\t")
rt[1:3, 1:5]
## NAME Description TCGA.A6.6780.01A.11R.A278.07 TCGA.CM.5868.01A.01R.1653.07 TCGA.CM.6165.01A.11R.1653.07
## 1 5S_rRNA 5S_rRNA 0.000000 0.00000000 0.0000000
## 2 7SK 7SK 1.085493 0.00000000 0.0000000
## 3 A1BG A1BG 0.000000 0.02556706 0.1764518
# filterCommonGenes()函数取表达矩阵与作者筛选得到的common gene进行比对过滤
filterCommonGenes(input.f = exp.file, # 输入文件,为自己的表达矩阵
output.f = "COAD_10412genes.gct", # 定义输出到工作目录的输出文件名,后缀为gct
id = "GeneSymbol" # 我们数据集的列名为GeneSymbol,因此这里选择拿GeneSymbol进行匹配,还可选择EntrezID("common_genes"展示如下)
)
## [1] "Merged dataset includes 9618 genes (794 mismatched)."
data("common_genes")
## common_genes[1:6,1:5]
EntrezID GeneSymbol Synonyms GeneName Chromosome
## 1 2 A2M A2MD|CPAMD5|FWP007|S863-7 alpha-2-macroglobulin chr12
## 2 9 NAT1 AAC1|MNAT|NAT-1|NATI N-acetyltransferase 1 (arylamine N-acetyltransferase) chr8
## 3 10 NAT2 AAC2|NAT-2|PNAT N-acetyltransferase 2 (arylamine N-acetyltransferase) chr8
## 4 12 SERPINA3 AACT|ACT|GIG25 serpin peptidase inhibitor, clade A (alpha-1 antiproteinase, antitrypsin), member 3 chr14
## 5 13 AADAC CES5A1|DAC arylacetamide deacetylase chr3
## 6 14 AAMP - angio-associated, migratory cell protein chr2
2. ESTIMATE分析
out.score.file = "ESTIMATE_score.gct"
# 计算所有样本的肿瘤纯度、免疫细胞评分和基质细胞评分
estimateScore(in.gct.file, # 刚才过滤得到的输入文件
out.score.file, # 输出的输出文件
platform = "illumina") # TCGA数据,选择illumina
# 绘制每一个样本的肿瘤细胞纯度,以及其与公共数据库中样本比较的散点图,结果会存储在工作路径下的./estimated_purity_plots/
plotPurity(scores = out.score.file)
## 输出ESTIMATE评分
ESTIMATE_score = read.table(out.score.file,
skip = 2,
header = T,
row.names = 1)
ESTIMATE_score = as.data.frame(t(ESTIMATE_score[, 2:ncol(ESTIMATE_score)])) # 取2到最后1列的数据并进行数据转置
ESTIMATE_score[1:6, 1:3]
## StromalScore ImmuneScore ESTIMATEScore
## TCGA.A6.6780.01A.11R.A278.07 -509.02444 758.65472 249.6303
## TCGA.CM.5868.01A.01R.1653.07 -554.82205 -569.25280 -1124.0749
## TCGA.CM.6165.01A.11R.1653.07 -41.48252 -76.20347 -117.6860
## TCGA.CA.6716.01A.11R.1839.07 -667.40171 -937.01249 -1604.4142
## TCGA.D5.6930.01A.11R.1928.07 -96.60552 350.85242 254.2469
## TCGA.AY.A8YK.01A.11R.A41B.07 -1261.93920 -633.35076 -1895.2900
write.table(ESTIMATE_score, file = "COAD_estimate_score.txt", sep = '\t', quote = F)
3. 可视化展示
## 绘图
library(ggpubr)
library(ggsci)
library(patchwork)
p1 <- ggplot(ESTIMATE_score, aes(x = group, y = StromalScore, fill = group)) +
geom_boxplot(position = position_dodge(0.8)) +
scale_fill_nejm() +
labs(x = "", y = 'Stromal Score') +
stat_compare_means() +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle = 30,vjust = 0.85,hjust = 0.75),
legend.position = 'none' # 去除注释图例
)
p2 <- ggplot(ESTIMATE_score, aes(x = group, y = ImmuneScore, fill = group)) +
geom_boxplot(position = position_dodge(0.8)) +
scale_fill_nejm() +
labs(x = "", y = 'ImmuneScore Score') +
stat_compare_means() +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle = 30,vjust = 0.85,hjust = 0.75),
legend.position = 'none' # 去除注释图例
)
p3 <- ggplot(ESTIMATE_score, aes(x = group, y = ESTIMATEScore, fill = group)) +
geom_boxplot(position = position_dodge(0.8)) +
scale_fill_nejm() +
labs(x = "", y = 'ESTIMATE Score') +
stat_compare_means() +
theme_bw(base_size = 16)
p1 + p2 + p3 + plot_layout(ncol = 3)
ESTIMATE-1