下载并预处理TCGA数据

本文介绍TCGA数据的下载方法,并将下载的数据整理为以基因名为行名的数据结构。

方法一

# Data source: download the archive below, extract it, and save the expression
# table as HNSC_RSEM_genes_normalized.txt.
#http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/HNSC/20160128/gdac.broadinstitute.org_HNSC.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0.tar.gz.md5
library(stringr)
hnsc <- read.table("your_dir/HNSC_RSEM_genes_normalized.txt",
                   header = TRUE, check.names = FALSE, sep = "\t",
                   stringsAsFactors = FALSE)

# Drop the first data row.
# NOTE(review): presumably this is the second header line of the Firehose file
# (gene_id / normalized_count) -- confirm against the downloaded table.
hnsc <- hnsc[-1, ]

# Identifiers in column 1 are split on "|" and only the leading part (the gene
# name) is kept.
gene_name <- vapply(
  strsplit(as.character(hnsc[, 1]), split = "|", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)
hnsc[, 1] <- gene_name

# Row names must be unique, so only the first row of each gene name survives.
hnsc <- hnsc[!duplicated(hnsc[, 1]), ]
row.names(hnsc) <- as.character(hnsc[, 1])
hnsc <- hnsc[, -1]

# Shorten every column name to its first 16 characters.
colnames(hnsc) <- substr(colnames(hnsc), 1, 16)

# If the dropped row contained text, read.table() parsed every expression
# column as character (or factor), so convert the values back to numbers.
# Going through as.character() keeps factor columns from turning into codes.
hnsc[] <- lapply(hnsc, function(values) as.numeric(as.character(values)))

write.csv(hnsc, "your_dir/hnsc_clean_data.csv")

方法二

乳腺癌PAM50

# Attach TCGAbiolinks, hiding the messages printed while it loads.
suppressMessages(library(TCGAbiolinks))
# Fetch the subtype table for the "brca" tumor code.
# NOTE(review): per the section heading this is used for the breast cancer
# PAM50 classification -- confirm which column of the result holds it.
BRCA_path_subtypes <- TCGAquery_subtype(tumor = "brca")

另一种方法,生存分析

我觉得这确实是目前最好的方法:它是官方提供、并且实时更新的数据下载方式。但是很容易出现报错(Error: $ operator is invalid for atomic vectors),解决方式为用github源进行安装,这个很重要。
具体代码如下

# Reference workflow (Chinese version of the SingscoreAMLMutations vignette):
#https://bioconductor.org/packages/release//workflows/vignettes/SingscoreAMLMutations/inst/doc/workflow_transcriptional_mut_sig_chinese.html
library(devtools)
# Install biomaRt and TCGAbiolinks from their GitHub sources.
# NOTE(review): the "Bioconductor-mirror" GitHub organisation looks to have been
# retired; if this install fails, installing biomaRt from Bioconductor itself
# is the likely replacement -- confirm before relying on it.
devtools::install_github("Bioconductor-mirror/biomaRt")
devtools::install_github("BioinformaticsFMRP/TCGAbiolinks")# Run this if you hit "Error: $ operator is invalid for atomic vectors"


# NOTE: this removes every object from the current workspace before the run.
rm(list=ls())
library(stringr)
# GDC project to download.
cancer_type <- "TCGA-BRCA"
# Get GDC version information. TCGAbiolinks is only attached later, inside
# Download_TCGA(), so the call is namespace-qualified to work on its own.
gdc_info <- TCGAbiolinks::getGDCInfo()
Release <- as.character(gdc_info$data_release)
# Keep the first 17 characters of the release string as the file-name label.
# NOTE(review): presumably this is the "Data Release NN.N" prefix -- confirm
# against the current value of gdc_info$data_release.
release <- substr(Release, 1, 17)

###download BRCA counts data

#' Download the main TCGA data types of one GDC project and export them as CSV
#'
#' Uses TCGAbiolinks to query, download and prepare the data, then writes the
#' results with relative file names (i.e. into the current working directory):
#' HTSeq counts and FPKM matrices (all samples, "NT" samples, "TP" samples),
#' the indexed clinical table, a survival table derived from the clinical XML
#' files, copy number segments, DNA methylation values and miRNA
#' quantification (full table, RPM columns, read-count columns).
#'
#' @param cancer_type GDC project id passed to `GDCquery()`, e.g. "TCGA-BRCA".
#' @param release Label inserted into every output file name.
#' @return `NULL`, invisibly; the function is called for its side effects.
Download_TCGA <- function(cancer_type, release) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  suppressMessages(library(DT))

  # Output names keep the "<project>-<tag>-<release>-<suffix>" layout.
  out_file <- function(tag, suffix = ".csv") {
    paste(cancer_type, tag, release, suffix, sep = "-")
  }

  # assay() only exists for SummarizedExperiment objects; anything else that
  # GDCprepare() returns is treated as an already-tabular result.
  as_plain_table <- function(x) {
    if (methods::is(x, "SummarizedExperiment")) {
      as.data.frame(assay(x))
    } else {
      as.data.frame(x)
    }
  }

  # Download one gene-expression workflow and write three files: all samples,
  # normal ("NT") samples and tumor ("TP") samples.
  # NOTE(review): the "HTSeq - ..." workflow names may no longer be offered by
  # recent GDC releases -- confirm against the current GDC data portal.
  export_expression <- function(workflow, label) {
    query <- GDCquery(project = cancer_type,
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = workflow)
    GDCdownload(query)
    expr <- as.data.frame(assay(GDCprepare(query = query)))
    write.csv(expr, file = out_file(paste0(label, "_")))
    normal_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                        typesample = c("NT"))
    tumor_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                       typesample = c("TP"))
    # drop = FALSE keeps a one-sample subset as a table with its column name.
    write.csv(expr[, normal_ids, drop = FALSE],
              file = out_file(paste0(label, "_normal_")))
    write.csv(expr[, tumor_ids, drop = FALSE],
              file = out_file(paste0(label, "_tumor_")))
  }

  export_expression("HTSeq - Counts", "Counts")
  export_expression("HTSeq - FPKM", "FPKM")

  # Indexed clinical table. The original also built a DT::datatable() here and
  # discarded it; inside a function the widget is never shown, so it is gone.
  clinical_index <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical_index, "clinical_full.csv")

  # Patient-level clinical data parsed from the XML files.
  CLC_query <- GDCquery(project = cancer_type,
                        data.category = "Clinical",
                        file.type = "xml")
  GDCdownload(CLC_query)
  patients <- GDCprepare_clinic(CLC_query, clinical.info = "patient")

  clinical_trait <- patients %>%
    dplyr::select(bcr_patient_barcode, gender, vital_status,
                  days_to_death, days_to_last_followup, race_list,
                  person_neoplasm_cancer_status,
                  stage_event_pathologic_stage,
                  stage_event_tnm_categories) %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Build the survival rows for one vital status: `time_col` becomes OS.Time
  # (converted from days to years), `unused_col` is removed, OS is 1 for
  # "Dead" and 0 otherwise. Only the selected columns are renamed, so no
  # extra renaming package is needed.
  survival_rows <- function(status, time_col, unused_col) {
    rows <- clinical_trait[which(clinical_trait$vital_status == status), ]
    rows[[unused_col]] <- NULL
    new_names <- c(bcr_patient_barcode = "Barcode",
                   gender = "Gender",
                   vital_status = "OS",
                   race_list = "Race",
                   person_neoplasm_cancer_status = "cancer_status",
                   stage_event_pathologic_stage = "Stage",
                   stage_event_tnm_categories = "TNM")
    new_names[[time_col]] <- "OS.Time"
    hit <- names(rows) %in% names(new_names)
    names(rows)[hit] <- unname(new_names[names(rows)[hit]])
    rows$OS <- ifelse(rows$OS == "Dead", 1, 0)
    # as.character() first so a factor/character column still yields days.
    rows$OS.Time <- as.numeric(as.character(rows$OS.Time)) / 365
    rows
  }

  dead_patient <- survival_rows("Dead", "days_to_death",
                                "days_to_last_followup")
  alive_patient <- survival_rows("Alive", "days_to_last_followup",
                                 "days_to_death")
  survival_data <- rbind(dead_patient, alive_patient)
  write.csv(survival_data, file = out_file("clinical_"))

  # Copy number segments. The prepared object is converted with
  # as_plain_table() because assay() fails on anything that is not a
  # SummarizedExperiment.
  CNV_query <- GDCquery(project = cancer_type,
                        data.category = "Copy Number Variation",
                        data.type = "Copy Number Segment")
  GDCdownload(CNV_query)
  CNV_expdat <- GDCprepare(query = CNV_query)
  write.csv(as_plain_table(CNV_expdat),
            file = out_file("Copy-Number-Variation_"))

  # DNA methylation from the legacy archive.
  # NOTE(review): `legacy = TRUE` may be rejected by recent TCGAbiolinks
  # versions -- confirm against the installed version.
  meth_query <- GDCquery(project = cancer_type,
                         legacy = TRUE,
                         data.category = "DNA methylation")
  GDCdownload(meth_query)
  meth_expdat <- GDCprepare(query = meth_query)
  write.csv(assay(meth_expdat), file = out_file("methylation_", "2.csv"))

  # miRNA quantification.
  miR_query <- GDCquery(project = cancer_type,
                        data.category = "Transcriptome Profiling",
                        data.type = "miRNA Expression Quantification",
                        workflow.type = "BCGSC miRNA Profiling")
  GDCdownload(miR_query)
  mir_table <- as.data.frame(GDCprepare(query = miR_query))
  # The full table used to be written to the same name as the read-count file
  # below and was overwritten by it; it now gets its own "miRNAs_full_" name.
  write.csv(mir_table, file = out_file("miRNAs_full_"))
  rownames(mir_table) <- as.character(mir_table[, 1])
  mir_table <- mir_table[, -1, drop = FALSE]

  # Everything after the first "TCGA" in a column name identifies the sample.
  sample_suffix <- vapply(
    strsplit(colnames(mir_table), split = "TCGA", fixed = TRUE),
    function(parts) parts[2],
    character(1)
  )
  sample_suffix <- unique(sample_suffix[!is.na(sample_suffix)])
  rpkm_names <- paste0("reads_per_million_miRNA_mapped_TCGA", sample_suffix)
  count_names <- paste0("read_count_TCGA", sample_suffix)
  write.csv(mir_table[, rpkm_names, drop = FALSE],
            file = out_file("miRNAs_RPKM"))
  write.csv(mir_table[, count_names, drop = FALSE],
            file = out_file("miRNAs_"))

  invisible(NULL)
}
Download_TCGA(cancer_type, release)

三、多线程批量下载所有TCGA

调用19线程

#!/usr/bin/env Rscript
# NOTE: this removes every object from the current workspace before the run.
rm(list=ls())
library(stringr)
library(parallel)
# projects.csv lists the projects to download in its `project_id` column.
cancerType <- read.csv("projects.csv", header = TRUE)
# Fail early: a missing column would otherwise give an empty job list and the
# batch would finish without downloading anything.
if (!"project_id" %in% names(cancerType)) {
  stop("projects.csv must contain a 'project_id' column", call. = FALSE)
}
cancer_type <- as.character(cancerType[["project_id"]])
# Drop blank ids and duplicates so no project is downloaded twice.
cancer_type <- unique(cancer_type[!is.na(cancer_type) & nzchar(cancer_type)])


###download counts data

#' Download the main TCGA data types of one GDC project into its own folder
#'
#' Creates `<base_dir>/<cancer_type>`, switches into it for the duration of
#' the call, and writes there: HTSeq counts and FPKM matrices (all samples,
#' "NT" samples, "TP" samples), the indexed clinical table, a survival table
#' derived from the clinical XML files, copy number segments, DNA methylation
#' values and miRNA quantification. The release label used in the file names
#' is read from `getGDCInfo()`.
#'
#' All helpers are nested inside the function so that it stays self-contained
#' when it is shipped to a parallel worker.
#'
#' @param cancer_type GDC project id passed to `GDCquery()`, e.g. "TCGA-BRCA".
#' @return `NULL`, invisibly; the function is called for its side effects.
Download_TCGA <- function(cancer_type) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  suppressMessages(library(DT))

  base_dir <- "~/Desktop/tcga_test" #should change this before you run
  out_dir <- paste0(base_dir, "/", cancer_type)
  # showWarnings = FALSE: re-running over an existing folder is not a problem.
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  # Restore the previous working directory when the function exits, so a
  # reused worker (or an interactive session) is left where it started.
  previous_dir <- setwd(out_dir)
  on.exit(setwd(previous_dir), add = TRUE)

  # Get GDC version information; the first 17 characters label the files.
  gdc_info <- getGDCInfo()
  release <- substr(as.character(gdc_info$data_release), 1, 17)

  # Output names keep the "<project>-<tag>-<release>-<suffix>" layout.
  out_file <- function(tag, suffix = ".csv") {
    paste(cancer_type, tag, release, suffix, sep = "-")
  }

  # assay() only exists for SummarizedExperiment objects; anything else that
  # GDCprepare() returns is treated as an already-tabular result.
  as_plain_table <- function(x) {
    if (methods::is(x, "SummarizedExperiment")) {
      as.data.frame(assay(x))
    } else {
      as.data.frame(x)
    }
  }

  # Download one gene-expression workflow and write three files: all samples,
  # normal ("NT") samples and tumor ("TP") samples.
  # NOTE(review): the "HTSeq - ..." workflow names may no longer be offered by
  # recent GDC releases -- confirm against the current GDC data portal.
  export_expression <- function(workflow, label) {
    query <- GDCquery(project = cancer_type,
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = workflow)
    GDCdownload(query)
    expr <- as.data.frame(assay(GDCprepare(query = query)))
    write.csv(expr, file = out_file(paste0(label, "_")))
    normal_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                        typesample = c("NT"))
    tumor_ids <- TCGAquery_SampleTypes(barcode = colnames(expr),
                                       typesample = c("TP"))
    # drop = FALSE keeps a one-sample subset as a table with its column name.
    write.csv(expr[, normal_ids, drop = FALSE],
              file = out_file(paste0(label, "_normal_")))
    write.csv(expr[, tumor_ids, drop = FALSE],
              file = out_file(paste0(label, "_tumor_")))
  }

  export_expression("HTSeq - Counts", "Counts")
  export_expression("HTSeq - FPKM", "FPKM")

  # Indexed clinical table. The original also built a DT::datatable() here and
  # discarded it; inside a function the widget is never shown, so it is gone.
  clinical_index <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical_index, "clinical_full.csv")

  # Patient-level clinical data parsed from the XML files.
  CLC_query <- GDCquery(project = cancer_type,
                        data.category = "Clinical",
                        file.type = "xml")
  GDCdownload(CLC_query)
  patients <- GDCprepare_clinic(CLC_query, clinical.info = "patient")

  clinical_trait <- patients %>%
    dplyr::select(bcr_patient_barcode, gender, vital_status,
                  days_to_death, days_to_last_followup, race_list,
                  person_neoplasm_cancer_status,
                  stage_event_pathologic_stage,
                  stage_event_tnm_categories) %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Build the survival rows for one vital status: `time_col` becomes OS.Time
  # (converted from days to years), `unused_col` is removed, OS is 1 for
  # "Dead" and 0 otherwise. Only the selected columns are renamed, so no
  # extra renaming package is needed.
  survival_rows <- function(status, time_col, unused_col) {
    rows <- clinical_trait[which(clinical_trait$vital_status == status), ]
    rows[[unused_col]] <- NULL
    new_names <- c(bcr_patient_barcode = "Barcode",
                   gender = "Gender",
                   vital_status = "OS",
                   race_list = "Race",
                   person_neoplasm_cancer_status = "cancer_status",
                   stage_event_pathologic_stage = "Stage",
                   stage_event_tnm_categories = "TNM")
    new_names[[time_col]] <- "OS.Time"
    hit <- names(rows) %in% names(new_names)
    names(rows)[hit] <- unname(new_names[names(rows)[hit]])
    rows$OS <- ifelse(rows$OS == "Dead", 1, 0)
    # as.character() first so a factor/character column still yields days.
    rows$OS.Time <- as.numeric(as.character(rows$OS.Time)) / 365
    rows
  }

  dead_patient <- survival_rows("Dead", "days_to_death",
                                "days_to_last_followup")
  alive_patient <- survival_rows("Alive", "days_to_last_followup",
                                 "days_to_death")
  survival_data <- rbind(dead_patient, alive_patient)
  write.csv(survival_data, file = out_file("clinical_"))

  # Copy number segments. The prepared object is converted with
  # as_plain_table() because assay() fails on anything that is not a
  # SummarizedExperiment.
  CNV_query <- GDCquery(project = cancer_type,
                        data.category = "Copy Number Variation",
                        data.type = "Copy Number Segment")
  GDCdownload(CNV_query)
  CNV_expdat <- GDCprepare(query = CNV_query)
  write.csv(as_plain_table(CNV_expdat),
            file = out_file("Copy-Number-Variation_"))

  # DNA methylation from the legacy archive.
  # NOTE(review): `legacy = TRUE` may be rejected by recent TCGAbiolinks
  # versions -- confirm against the installed version.
  meth_query <- GDCquery(project = cancer_type,
                         legacy = TRUE,
                         data.category = "DNA methylation")
  GDCdownload(meth_query)
  meth_expdat <- GDCprepare(query = meth_query)
  write.csv(assay(meth_expdat), file = out_file("methylation_", "2.csv"))

  # miRNA quantification; converted with as_plain_table() for the same reason
  # as the copy number data.
  miR_query <- GDCquery(project = cancer_type,
                        data.category = "Transcriptome Profiling",
                        data.type = "miRNA Expression Quantification",
                        workflow.type = "BCGSC miRNA Profiling")
  GDCdownload(miR_query)
  miR_expdat <- GDCprepare(query = miR_query)
  write.csv(as_plain_table(miR_expdat), file = out_file("miRNAs_", "2.csv"))

  message(paste0(cancer_type, " Download Finished!"))
  invisible(NULL)
}

# Run one project and turn an error into a status string, so that a single
# failing project does not abort the rest of the batch.
run_project <- function(project) {
  tryCatch({
    Download_TCGA(project)
    "ok"
  }, error = function(e) {
    paste0("failed: ", conditionMessage(e))
  })
}

# Up to 19 workers, but never more than there are projects to process.
n_workers <- max(1L, min(19L, length(cancer_type)))
cl <- makeCluster(n_workers)
download_status <- tryCatch({
  # Workers start with an empty global environment; run_project() looks up
  # Download_TCGA there, so it has to be exported first.
  clusterExport(cl, "Download_TCGA")
  unlist(parLapply(cl, cancer_type, run_project))
}, finally = stopCluster(cl))  # always release the workers, even on error
names(download_status) <- cancer_type
print(download_status)
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 213,711评论 6 493
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 91,079评论 3 387
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 159,194评论 0 349
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 57,089评论 1 286
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 66,197评论 6 385
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,306评论 1 292
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,338评论 3 412
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,119评论 0 269
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,541评论 1 306
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 36,846评论 2 328
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,014评论 1 341
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,694评论 4 337
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,322评论 3 318
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,026评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,257评论 1 267
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 46,863评论 2 365
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 43,895评论 2 351

推荐阅读更多精彩内容

  • 父类实现深拷贝时,子类如何实现深度拷贝。父类没有实现深拷贝时,子类如何实现深度拷贝。• 深拷贝同浅拷贝的区别:浅拷...
    JonesCxy阅读 995评论 1 7
  • • 深拷贝同浅拷贝的区别:浅拷贝是指针拷贝,对一个对象进行浅拷贝,相当于对指向对象的指针进行复制,产生一个新的指向...
    WSGNSLog阅读 1,251评论 0 1
  • 人生只有一条路 你我只能走一次 我慢慢走 你缓缓来 不用着急 因为还有一生的路好走
    看她笑i阅读 323评论 0 0
  • 做DNS开发时,无论是安装bind还是nsd,都需要一份zone文件进行测试,最简单的zone文件包括 SOA记录...
    dnsir阅读 3,503评论 0 0