主要照着老大的这篇教程来的TCGA
生存分析的统计学探究 (老大的)
Surv:用于创建生存数据对象
survfit:创建KM生存曲线或是Cox调整生存曲线
survdiff:用于不同组的统计检验
###先把这些代码保存下来
library(survival)
my.surv <- Surv(OS_MONTHS,OS_STATUS==’LIVING’) ##这个生存对象是看看病人的总生存期与死亡状态的关系
##这个Surv函数第一个参数必须是数值型的时间,第二个参数是逻辑向量,1,0表示死亡与否
kmfit1 <- survfit(my.surv~1) ## 直接对生存对象拟合生存函数
summary(kmfit1)
plot(kmfit1) ##画出生存曲线
survdiff(my.surv~type, data=dat) ### 根据生存对象再加上一个分组因子来拟合生存函数,并且比较不同因子分组的生存效果。
library(cgdsr)
mycgds <- CGDS(‘http://www.cbioportal.org/public-portal/‘)
test(mycgds)
all_TCGA_studies <- getCancerStudies(mycgds)
all_tables <- getCaseLists(mycgds, ‘ov_tcga_pub’)
all_dataset<- getGeneticProfiles(mycgds, ‘ov_tcga_pub’)
BRCA1 <- getProfileData(mycgds, my_gene, my_dataset, my_table)
ov_tcga_pub_meth1<- getClinicalData(mycgds, all_tables[8,1])
ov_tcga_pub_meth2<- getClinicalData(mycgds, all_tables[9,1])
ov_tcga_pub_meth3<- getClinicalData(mycgds, all_tables[10,1])
ov_tcga_pub_meth4<- getClinicalData(mycgds, all_tables[11,1])
library(survival)
attach(ov_tcga_pub_meth1)
## 估计KM生存曲线
my.surv <- Surv(OS_MONTHS,OS_STATUS==’LIVING’)
kmfit1 <- survfit(my.surv~1)
summary(kmfit1)
plot(kmfit1)
kmfit2 <- survfit(my.surv~TUMOR_STAGE_2009)
summary(kmfit2)
plot(kmfit2,col = rainbow(length(unique(ov_tcga_pub_meth1[,13]))))
detach(ov_tcga_pub_meth1)
ov_tcga_pub_meth1$sample<- rownames(ov_tcga_pub_meth1)
ov_tcga_pub_meth2$sample<- rownames(ov_tcga_pub_meth2)
ov_tcga_pub_meth3$sample<- rownames(ov_tcga_pub_meth3)
ov_tcga_pub_meth4$sample<- rownames(ov_tcga_pub_meth4)
ov_tcga_pub_meth1$type<- ‘meth1′
ov_tcga_pub_meth2$type<- ‘meth2′
ov_tcga_pub_meth3$type<- ‘meth3′
ov_tcga_pub_meth4$type<- ‘meth4′
dat=rbind(ov_tcga_pub_meth1,ov_tcga_pub_meth2,ov_tcga_pub_meth3,ov_tcga_pub_meth4)
attach(dat)
## 根据分组type估计KM生存曲线
my.surv <- Surv(OS_MONTHS,OS_STATUS==’LIVING’)
kmfit3 <- survfit(my.surv~type)
summary(kmfit3)
plot(kmfit3,col = rainbow(4))
##由图中可以看到甲基化数据分成的4个组,生存率差异还是蛮大的
##用图形方法检验PH假设
plot(kmfit3,fun=’cloglog’,col = rainbow(4))
# 检验显著性
survdiff(my.surv~type, data=dat)
# 用strata来控制协变量的影响
survdiff(my.surv~type+strata(TUMOR_STAGE_2009),data=dat)
2.文章中提到:使用R包cgdsr来下载TCGA的数据http://www.bio-info-trainee.com/1257.html
getCancerStudies,getCaseLists,getGeneticProfiles,getProfileData
TCGA的28篇教程- 使用R语言的RTCGA包获取TCGA数据
https://mp.weixin.qq.com/s/JB_329LCWqo5dY6MLawfEA
1.指定任意基因从任意癌症里面获取芯片表达数据
compare_means(c(GATA3, PTEN, XBP1) ~ dataset, data = expr)
2.指定任意基因从任意癌症里面获取测序表达数据
3.用全部的rnaseq的表达数据来做主成分分析
4.用5个基因在3个癌症的表达量做主成分分析
5.用突变数据做生存分析
6.多个基因在多种癌症的表达量热图
==代码==
#1.指定任意基因从任意癌症里面获取芯片表达数据
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
library(BiocManager)
BiocManager::install("RTCGA.clinical")
BiocManager::install("RTCGA.mRNA")
BiocManager::install('RTCGA.mutations')
library(RTCGA.mutations)
expr <- expressionsTCGA(BRCA.mRNA, OV.mRNA, LUSC.mRNA,
extract.cols = c("GATA3", "PTEN", "XBP1","ESR1", "MUC1"))
expr
head <- head(expr)
nb_samples <- table(expr$dataset)
nb_samples
expr$dataset <- gsub(pattern = ".mRNA", replacement = "", expr$dataset)
expr$dataset
expr$bcr_patient_barcode <- paste0(expr$dataset, c(1:590, 1:561, 1:154))
expr$bcr_patient_barcode
library(ggpubr)
## Loading required package: ggplot2
## Loading required package: magrittr
# GATA3
ggboxplot(expr, x = "dataset", y = "GATA3",
title = "GATA3", ylab = "Expression",
color = "dataset", palette = "jco")
ggboxplot(expr, x = "dataset", y = "PTEN",
title = "PTEN", ylab = "Expression",
color = "dataset", palette = "jco")
my_comparisons <- list(c("BRCA", "OV"), c("OV", "LUSC"))#加上Pvalues
ggboxplot(expr, x = "dataset", y = "GATA3",
title = "GATA3", ylab = "Expression",
color = "dataset", palette = "jco")+
stat_compare_means(comparisons = my_comparisons)
compare_means(c(GATA3, PTEN, XBP1) ~ dataset, data = expr)
label.select.criteria <- list(criteria = "`y` > 10 & `x` %in% c('BRCA', 'OV')")
ggboxplot(expr, x = "dataset",
y = c("GATA3", "PTEN", "XBP1"),
combine = TRUE,
color = "dataset", palette = "jco",
ylab = "Expression",
label = "bcr_patient_barcode", # column containing point labels
label.select = label.select.criteria, # Select some labels to display
font.label = list(size = 9, face = "italic"), # label font
repel = TRUE # Avoid label text overplotting
)
ggboxplot(expr, x = "dataset", y = "GATA3",
title = "GATA3", ylab = "Expression",
color = "dataset", palette = "jco",
rotate = TRUE)
#2.指定任意基因从任意癌症里面获取测序表达数据
library(RTCGA)
library(RTCGA.rnaseq)
expr <- expressionsTCGA(BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq,
extract.cols = c("GATA3|2625", "PTEN|5728", "XBP1|7494","ESR1|2099", "MUC1|4582"))
expr
nb_samples <- table(expr$dataset)
nb_samples
library(ggpubr)
# ESR1|2099
ggboxplot(expr, x = "dataset", y = "`PTEN|5728`",
title = "PTEN|5728", ylab = "Expression",
color = "dataset", palette = "jco")
#3.用全部的rnaseq的表达数据来做主成分分析
library(RTCGA.rnaseq)
library(dplyr)
expressionsTCGA(BRCA.rnaseq, OV.rnaseq, HNSC.rnaseq) %>%
dplyr::rename(cohort = dataset) %>%
filter(substr(bcr_patient_barcode, 14, 15) == "01") -> BRCA.OV.HNSC.rnaseq.cancer
pcaTCGA(BRCA.OV.HNSC.rnaseq.cancer, "cohort") -> pca_plot
plot(pca_plot)
save(pca_plot,file = "pca-png")
#4.用5个基因在3个癌症的表达量做主成分分析
BiocManager::install('DT')
library(DT)
expr %>%
filter(substr(bcr_patient_barcode, 14, 15) == "01") -> rnaseq.5genes.3cancers
DT::datatable(rnaseq.5genes.3cancers)
pcaTCGA(rnaseq.5genes.3cancers, "dataset") -> pca_plot
plot(pca_plot)
#5.用突变数据做生存分析
library(RTCGA.mutations)
# library(dplyr) if did not load at start
library(survminer)
mutationsTCGA(BRCA.mutations, OV.mutations) %>%
filter(Hugo_Symbol == 'TP53') %>%
filter(substr(bcr_patient_barcode, 14, 15) ==
"01") %>% # cancer tissue
mutate(bcr_patient_barcode =
substr(bcr_patient_barcode, 1, 12)) ->
BRCA_OV.mutations
library(RTCGA.clinical)
survivalTCGA(
BRCA.clinical,
OV.clinical,
extract.cols = "admin.disease_code"
) %>%
dplyr::rename(disease = admin.disease_code) ->
BRCA_OV.clinical
BRCA_OV.clinical %>%
left_join(
BRCA_OV.mutations,
by = "bcr_patient_barcode"
) %>%
mutate(TP53 =
ifelse(!is.na(Variant_Classification), "Mut","WILDorNOINFO")) ->
BRCA_OV.clinical_mutations
BRCA_OV.clinical_mutations %>%
select(times, patient.vital_status, disease, TP53) -> BRCA_OV.2plot
kmTCGA(
BRCA_OV.2plot,
explanatory.names = c("TP53", "disease"),
break.time.by = 400,
xlim = c(0,2000),
pval = TRUE) -> km_plot
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
print(km_plot)
#6.多个基因在多种癌症的表达量热图
library(RTCGA.rnaseq)
# perfrom plot
# library(dplyr) if did not load at start
expressionsTCGA(
ACC.rnaseq,
BLCA.rnaseq,
BRCA.rnaseq,
OV.rnaseq,
extract.cols =
c("MET|4233",
"ZNF500|26048",
"ZNF501|115560")
) %>%
dplyr::rename(cohort = dataset,
MET = `MET|4233`) %>%
#cancer samples
filter(substr(bcr_patient_barcode, 14, 15) ==
"01") %>%
mutate(MET = cut(MET,
round(quantile(MET, probs = seq(0,1,0.25)), -2),
include.lowest = TRUE,
dig.lab = 5)) -> ACC_BLCA_BRCA_OV.rnaseq
ACC_BLCA_BRCA_OV.rnaseq %>%
select(-bcr_patient_barcode) %>%
group_by(cohort, MET) %>%
summarise_each(funs(median)) %>%
mutate(ZNF500 = round(`ZNF500|26048`),
ZNF501 = round(`ZNF501|115560`)) ->
ACC_BLCA_BRCA_OV.rnaseq.medians
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
heatmapTCGA(ACC_BLCA_BRCA_OV.rnaseq.medians,
"cohort", "MET", "ZNF500",
title = "Heatmap of ZNF500 expression")