ICGC数据下载和整理
1.数据下载
找到自己要的项目下载即可。以RECA-EU为例。
exp_seq.RECA-EU.tsv.gz
specimen.RECA-EU.tsv.gz
2. 整理表达矩阵
rm(list=ls())
library(tidyverse)
dat = data.table::fread("exp_seq.RECA-EU.tsv.gz",data.table = F)
dat[1:4,1:4]
## icgc_donor_id project_code icgc_specimen_id icgc_sample_id
## 1 DO47068 RECA-EU SP103448 SA507253
## 2 DO47068 RECA-EU SP103448 SA507253
## 3 DO47068 RECA-EU SP103448 SA507253
## 4 DO47068 RECA-EU SP103448 SA507253
colnames(dat)
## [1] "icgc_donor_id" "project_code"
## [3] "icgc_specimen_id" "icgc_sample_id"
## [5] "submitted_sample_id" "analysis_id"
## [7] "gene_model" "gene_id"
## [9] "normalized_read_count" "raw_read_count"
## [11] "fold_change" "assembly_version"
## [13] "platform" "total_read_count"
## [15] "experimental_protocol" "alignment_algorithm"
## [17] "normalization_algorithm" "other_analysis_algorithm"
## [19] "sequencing_strategy" "raw_data_repository"
## [21] "raw_data_accession" "reference_sample_type"
我们需要count,基因id,还有样本ID和病人ID列。病人和样本ID合并为一列
dat = dat[,c(
'icgc_specimen_id',
'icgc_donor_id',
'gene_id',
'raw_read_count'
)]
dat = unite(dat, "sample_id", icgc_specimen_id, icgc_donor_id)
head(dat)
## sample_id gene_id raw_read_count
## 1 SP103448_DO47068 ENSG00000000003 1986
## 2 SP103448_DO47068 ENSG00000000005 1
## 3 SP103448_DO47068 ENSG00000000419 1418
## 4 SP103448_DO47068 ENSG00000000457 252
## 5 SP103448_DO47068 ENSG00000000460 225
## 6 SP103448_DO47068 ENSG00000000938 382
长变宽,变成常见的表达矩阵的样子
exp = dat%>% pivot_wider(names_from = sample_id, values_from = raw_read_count) %>%
column_to_rownames("gene_id") %>%
as.matrix()
exp[1:3,1:3]
## SP103448_DO47068 SP103459_DO47072 SP103420_DO47056
## ENSG00000000003 1986 732 423
## ENSG00000000005 1 1 19
## ENSG00000000419 1418 295 216
3.分组信息
pd = data.table::fread('specimen.RECA-EU.tsv.gz',data.table = F)
colnames(pd)
## [1] "icgc_specimen_id" "project_code"
## [3] "study_specimen_involved_in" "submitted_specimen_id"
## [5] "icgc_donor_id" "submitted_donor_id"
## [7] "specimen_type" "specimen_type_other"
## [9] "specimen_interval" "specimen_donor_treatment_type"
## [11] "specimen_donor_treatment_type_other" "specimen_processing"
## [13] "specimen_processing_other" "specimen_storage"
## [15] "specimen_storage_other" "tumour_confirmed"
## [17] "specimen_biobank" "specimen_biobank_id"
## [19] "specimen_available" "tumour_histological_type"
## [21] "tumour_grading_system" "tumour_grade"
## [23] "tumour_grade_supplemental" "tumour_stage_system"
## [25] "tumour_stage" "tumour_stage_supplemental"
## [27] "digital_image_of_stained_section" "percentage_cellularity"
## [29] "level_of_cellularity"
pd = pd[,c(
'icgc_specimen_id',
'icgc_donor_id',
'specimen_type'
)]
pd = unite(pd, "sample_id", icgc_specimen_id, icgc_donor_id)
head(pd)
## sample_id specimen_type
## 1 SP103444_DO47068 Primary tumour - solid tissue
## 2 SP103452_DO47068 Normal - blood derived
## 3 SP103448_DO47068 Primary tumour - solid tissue
## 4 SP103436_DO47064 Primary tumour - solid tissue
## 5 SP103440_DO47064 Normal - blood derived
## 6 SP103463_DO47072 Normal - blood derived
pd = pd[match(colnames(exp),pd$sample_id),]
identical(colnames(exp),pd$sample_id)
## [1] TRUE
table(pd$specimen_type)
##
## Normal - tissue adjacent to primary Primary tumour - solid tissue
## 45 91
Group = ifelse(str_detect(pd$specimen_type,"tumour"),
'tumor','normal')
table(Group)
## Group
## normal tumor
## 45 91
4.行名转换
有的数据下载下来就已经是symbol,不需要转换,也有的像这个一样是ENSEMBLID,简单转换一下。
library(tinyarray)
exp = trans_exp_new(exp)
save(exp,pd,Group,file = "RECA_exp_pd_Group.Rdata")