# TCGAbiolinks 是一个用于 TCGA 数据综合分析的 R/Bioconductor 软件包,能够通过 GDC Application Programming Interface (API) 访问 National Cancer Institute (NCI) 的 Genomic Data Commons (GDC),来搜索、下载和准备 TCGA 相关数据,以便在 R 中进行分析。
# TCGA 数据库提供的下载数据入口:https://portal.gdc.cancer.gov
# Session setup: clear the workspace, set options, then install (when missing)
# and attach the packages used by the rest of the script.
rm(list = ls()) # wipes every object in the current workspace
options(stringsAsFactors = FALSE)

# TCGAbiolinks and SummarizedExperiment are distributed through Bioconductor,
# not CRAN, so install.packages() cannot find them. They are installed with
# BiocManager, which itself comes from CRAN.
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("TCGAbiolinks")
}
# library() stops with an error if the package is still unavailable, instead
# of returning FALSE the way require() does.
library(TCGAbiolinks)

if (!requireNamespace("SummarizedExperiment", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("SummarizedExperiment")
}
library(SummarizedExperiment)
# TCGAbiolinks:::getGDCprojects()$project_id  # list the project id of each cancer type
# TCGAbiolinks:::getProjectSummary("TCGA-LUAD")  # list the data types in a project, e.g. "TCGA-LUAD"
# The workflow usually has three steps: GDCquery, GDCdownload and GDCprepare.
# Data retrieval with GDCquery.
# 1) Download transcriptome data (counts)
# NOTE(review): newer GDC data releases may label this workflow "STAR - Counts";
# confirm the value against the installed TCGAbiolinks version.
query <- GDCquery(
  project = "TCGA-LUAD", # project id of the cancer type
  data.category = "Transcriptome Profiling", # data category: transcriptome data
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - Counts"
)

# Run the download. Per the original author's note, files already present
# locally are reported rather than downloaded again.
# GDCdownload(query)
GDCdownload(
  query,
  method = "api",
  files.per.chunk = 20 # download in chunks to lower the risk of a failed download
)

# Assemble the downloaded files into a single object; GDCprepare is normally
# the last step that is needed.
data <- GDCprepare(query, save = FALSE)
LUAD_Data <- assay(data)
dim(LUAD_Data)
save(data, file = "LUAD_counts.Rdata")
# Export the extracted matrix: `data` is the container that assay() reads
# from, and write.csv() needs a matrix or data frame.
write.csv(LUAD_Data, file = "TCGAbiolinks-LUAD-counts.csv")
# data <- GDCprepare(query = query, save = TRUE, save.filename = "LUAD_counts.Rdata")
# summarizedExperiment | whether to build a SummarizedExperiment object; defaults to TRUE
# 2) Download transcriptome data (FPKM)
query <- GDCquery(
  project = "TCGA-LUAD",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "HTSeq - FPKM"
)
GDCdownload(query, files.per.chunk = 20)

# save = TRUE asks GDCprepare to also write the prepared object to
# save.filename.
Data <- GDCprepare(
  query,
  save = TRUE,
  save.filename = "LUAD_mRNA_FPKM.rdata"
)

# Export this section's result (`Data`, capital D), not a lowercase `data`
# object left over from another section. assay() pulls the expression matrix
# out of the prepared object so that write.csv() receives a plain matrix.
write.csv(assay(Data), file = "TCGAbiolinks-LUAD-FPKM.csv")
# 3) Download clinical information
# Fetch the clinical records for the TCGA-LUAD project, then show the
# dimensions of the result.
clinic <- GDCquery_clinic("TCGA-LUAD", type = "Clinical")
dim(clinic)
# 4) Download miRNA expression data
query <- GDCquery(
  project = "TCGA-LUAD",
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification",
  workflow.type = "BCGSC miRNA Profiling"
)
GDCdownload(query, method = "api", files.per.chunk = 20)

# summarizedExperiment is switched off for this data type, so the prepared
# object is passed directly to write.csv() below.
mir_exp <- GDCprepare(query = query, summarizedExperiment = FALSE)

# Keep both a CSV export and an R-native copy of the raw table.
write.csv(mir_exp, file = "TCGAbiolinks-LUAD-miRNA.csv")
save(mir_exp, file = "LUAD_miRNA_raw.Rdata")
# 5) Download somatic mutations
# The result reuses the name `data`; save() stores the object under that
# name, so the name is part of what ends up in the .Rdata file.
# NOTE(review): GDCquery_Maf may be unavailable in recent TCGAbiolinks
# releases; confirm against the installed version.
data <- GDCquery_Maf("LUAD", pipelines = "mutect2")
write.csv(data, file = "TCGAbiolinks-LUAD-mutect2.csv")
save(data, file = "LUAD_mut.Rdata")
# 6) Copy number variation data (GISTIC2)
# Query the open-access gene-level copy number scores for TCGA-LUAD.
query <- GDCquery(
  project = "TCGA-LUAD",
  data.category = "Copy Number Variation",
  data.type = "Gene Level Copy Number Scores",
  access = "open"
)
GDCdownload(query, files.per.chunk = 20)

# save = FALSE: the object is written out explicitly below instead.
LUAD_GISTIC2 <- GDCprepare(query, save = FALSE)
save(LUAD_GISTIC2, file = "LUAD_GISTIC2.Rdata")
write.csv(LUAD_GISTIC2, file = "TCGAbiolinks-LUAD_GISTIC2.csv")
# 7) DNA methylation
query <- GDCquery(
  project = "TCGA-LUAD",
  data.category = "DNA Methylation",
  platform = "Illumina Human Methylation 450"
)
GDCdownload(query, files.per.chunk = 100)
LUAD_DNAme <- GDCprepare(query, save = FALSE)

# write.csv() needs a matrix or data frame. GDCprepare() builds a
# SummarizedExperiment by default (its summarizedExperiment argument is not
# overridden here), so export the assay matrix in that case and fall back to
# writing the object directly when it is already tabular.
if (methods::is(LUAD_DNAme, "SummarizedExperiment")) {
  write.csv(
    assay(LUAD_DNAme),
    file = "TCGAbiolinks-LUAD-methylation 450.csv"
  )
} else {
  write.csv(LUAD_DNAme, file = "TCGAbiolinks-LUAD-methylation 450.csv")
}
# The full prepared object is kept in the .Rdata file.
save(LUAD_DNAme, file = "LUAD_methylation.Rdata")