ICGC数据下载和整理

ICGC数据下载和整理

1.数据下载

https://dcc.icgc.org/

https://dcc.icgc.org/releases/current/Projects

找到自己要的项目下载即可。以RECA-EU为例。

exp_seq.RECA-EU.tsv.gz
specimen.RECA-EU.tsv.gz

2. 整理表达矩阵

rm(list=ls())
library(tidyverse)
dat = data.table::fread("exp_seq.RECA-EU.tsv.gz",data.table = F)
dat[1:4,1:4]

##   icgc_donor_id project_code icgc_specimen_id icgc_sample_id
## 1       DO47068      RECA-EU         SP103448       SA507253
## 2       DO47068      RECA-EU         SP103448       SA507253
## 3       DO47068      RECA-EU         SP103448       SA507253
## 4       DO47068      RECA-EU         SP103448       SA507253

colnames(dat) 

##  [1] "icgc_donor_id"            "project_code"            
##  [3] "icgc_specimen_id"         "icgc_sample_id"          
##  [5] "submitted_sample_id"      "analysis_id"             
##  [7] "gene_model"               "gene_id"                 
##  [9] "normalized_read_count"    "raw_read_count"          
## [11] "fold_change"              "assembly_version"        
## [13] "platform"                 "total_read_count"        
## [15] "experimental_protocol"    "alignment_algorithm"     
## [17] "normalization_algorithm"  "other_analysis_algorithm"
## [19] "sequencing_strategy"      "raw_data_repository"     
## [21] "raw_data_accession"       "reference_sample_type"

我们需要count,基因id,还有样本ID和病人ID列。病人和样本ID合并为一列

dat = dat[,c(
  'icgc_specimen_id', 
  'icgc_donor_id',
  'gene_id', 
  'raw_read_count' 
)]
dat = unite(dat, "sample_id", icgc_specimen_id, icgc_donor_id)
head(dat)

##          sample_id         gene_id raw_read_count
## 1 SP103448_DO47068 ENSG00000000003           1986
## 2 SP103448_DO47068 ENSG00000000005              1
## 3 SP103448_DO47068 ENSG00000000419           1418
## 4 SP103448_DO47068 ENSG00000000457            252
## 5 SP103448_DO47068 ENSG00000000460            225
## 6 SP103448_DO47068 ENSG00000000938            382

长变宽,变成常见的表达矩阵的样子

exp = dat%>% pivot_wider(names_from = sample_id, values_from = raw_read_count) %>%
  column_to_rownames("gene_id") %>%
  as.matrix()
exp[1:3,1:3]

##                 SP103448_DO47068 SP103459_DO47072 SP103420_DO47056
## ENSG00000000003             1986              732              423
## ENSG00000000005                1                1               19
## ENSG00000000419             1418              295              216

3.分组信息

pd = data.table::fread('specimen.RECA-EU.tsv.gz',data.table = F)
colnames(pd)

##  [1] "icgc_specimen_id"                    "project_code"                       
##  [3] "study_specimen_involved_in"          "submitted_specimen_id"              
##  [5] "icgc_donor_id"                       "submitted_donor_id"                 
##  [7] "specimen_type"                       "specimen_type_other"                
##  [9] "specimen_interval"                   "specimen_donor_treatment_type"      
## [11] "specimen_donor_treatment_type_other" "specimen_processing"                
## [13] "specimen_processing_other"           "specimen_storage"                   
## [15] "specimen_storage_other"              "tumour_confirmed"                   
## [17] "specimen_biobank"                    "specimen_biobank_id"                
## [19] "specimen_available"                  "tumour_histological_type"           
## [21] "tumour_grading_system"               "tumour_grade"                       
## [23] "tumour_grade_supplemental"           "tumour_stage_system"                
## [25] "tumour_stage"                        "tumour_stage_supplemental"          
## [27] "digital_image_of_stained_section"    "percentage_cellularity"             
## [29] "level_of_cellularity"

pd = pd[,c(
  'icgc_specimen_id', 
  'icgc_donor_id', 
  'specimen_type' 
)]
pd = unite(pd, "sample_id", icgc_specimen_id, icgc_donor_id)
head(pd)

##          sample_id                 specimen_type
## 1 SP103444_DO47068 Primary tumour - solid tissue
## 2 SP103452_DO47068        Normal - blood derived
## 3 SP103448_DO47068 Primary tumour - solid tissue
## 4 SP103436_DO47064 Primary tumour - solid tissue
## 5 SP103440_DO47064        Normal - blood derived
## 6 SP103463_DO47072        Normal - blood derived

pd = pd[match(colnames(exp),pd$sample_id),]
identical(colnames(exp),pd$sample_id)

## [1] TRUE

table(pd$specimen_type) 

## 
## Normal - tissue adjacent to primary       Primary tumour - solid tissue 
##                                  45                                  91

Group = ifelse(str_detect(pd$specimen_type,"tumour"),
                       'tumor','normal')
table(Group)

## Group
## normal  tumor 
##     45     91

4.行名转换

有的数据下载下来就已经是symbol,不需要转换,也有的像这个一样是ENSEMBLID,简单转换一下。

library(tinyarray)
exp = trans_exp_new(exp)
save(exp,pd,Group,file = "RECA_exp_pd_Group.Rdata")
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容