0.数据下载
网址:https://sites.broadinstitute.org/ccle
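下载路径:Datasets → Broad DepMap Portal → CCLE 2019 → All Files,下载 TPM 表达矩阵文件(CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz)和细胞系注释文件(Cell_lines_annotations_20181226.txt),并放到当前工作目录下。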
1.读取表达矩阵
# Run this script in a fresh R session instead of calling `rm(list = ls())`:
# that call only removes global objects (loaded packages and options stay), and
# it silently wipes the workspace of anyone who sources the script.

# Read the gzipped CCLE gene-level TPM table. `data.table = FALSE` makes
# fread() return a plain data.frame, which the `dat[, -(1:2)]` and
# `dat$gene_id` accesses further down rely on.
dat <- data.table::fread(
  "CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz",
  data.table = FALSE
)
# Preview the top-left corner: the first two columns are `gene_id` and
# `transcript_ids`; the remaining columns are one cell line each.
dat[1:4, 1:4]
## gene_id
## 1 ENSG00000000003.10
## 2 ENSG00000000005.5
## 3 ENSG00000000419.8
## 4 ENSG00000000457.9
## transcript_ids
## 1 ENST00000373020.4,ENST00000494424.1,ENST00000496771.1
## 2 ENST00000373031.4,ENST00000485971.1
## 3 ENST00000371582.4,ENST00000371583.5,ENST00000371584.4,ENST00000371588.5,ENST00000413082.1,ENST00000466152.1,ENST00000494752.1
## 4 ENST00000367770.1,ENST00000367771.6,ENST00000367772.4,ENST00000423670.1,ENST00000470238.1
## 22RV1_PROSTATE 2313287_STOMACH
## 1 5.28 7.01
## 2 0.00 0.00
## 3 73.38 108.99
## 4 9.76 16.76
library(stringr)

# Drop the two identifier columns (gene_id, transcript_ids) so that only the
# per-cell-line expression columns remain, and store them as a matrix.
exp <- as.matrix(dat[, -(1:2)])
# Use the Ensembl gene ID without its version suffix as the row name:
# split on "." and keep the first piece ("ENSG00000000003.10" ->
# "ENSG00000000003"). `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
rownames(exp) <- str_split(dat$gene_id, "\\.", simplify = TRUE)[, 1]
# log2(TPM + 1); the +1 keeps zero expression at 0 instead of -Inf.
exp <- log2(exp + 1)
# Preview the transformed matrix.
exp[1:4, 1:4]
## 22RV1_PROSTATE 2313287_STOMACH 253JBV_URINARY_TRACT
## ENSG00000000003 2.650765 3.001802 4.572890
## ENSG00000000005 0.000000 0.000000 0.000000
## ENSG00000000419 6.216843 6.781229 5.845741
## ENSG00000000457 3.427606 4.150560 1.839960
## 253J_URINARY_TRACT
## ENSG00000000003 4.577731
## ENSG00000000005 0.000000
## ENSG00000000419 5.535742
## ENSG00000000457 2.087463
# Convert row names from Ensembl gene IDs to gene symbols.
library(AnnoProbe)
library(tinyarray)

# Annotation table for the Ensembl IDs currently used as row names.
an <- annoGene(
  rownames(exp),
  ID_type = "ENSEMBL"
)
# Re-key the matrix using the ENSEMBL -> SYMBOL mapping held in `an`.
exp <- trans_array(
  exp,
  ids = an,
  from = "ENSEMBL",
  to = "SYMBOL"
)
# Preview the re-keyed matrix.
exp[1:4, 1:4]
## 22RV1_PROSTATE 2313287_STOMACH 253JBV_URINARY_TRACT
## DDX11L1 0.1634987 0.0000000 0.02856915
## WASH7P 4.5422580 4.1667154 3.79285535
## MIR1302-2HG 0.0000000 0.1505597 0.00000000
## FAM138A 0.0000000 0.0000000 0.95605665
## 253J_URINARY_TRACT
## DDX11L1 0.0000000
## WASH7P 3.5861642
## MIR1302-2HG 0.0000000
## FAM138A 0.5753123
2. 读取注释信息
# Read the tab-delimited cell-line annotation table.
clinical <- read.delim(file = "Cell_lines_annotations_20181226.txt")
# Give columns 1 and 5 short names: `id` is matched against the expression
# matrix column names below, and `site` is the grouping used for plotting.
names(clinical)[c(1L, 5L)] <- c("id", "site")
3.表达矩阵和临床信息对应起来
# Cell lines present in both the expression matrix and the annotation table.
a <- intersect(colnames(exp), clinical$id)
# Keep only the shared cell lines. `drop = FALSE` keeps `exp` a matrix even
# when a single cell line is shared; without it the result collapses to a
# bare vector and the `colnames(exp)` / `exp[g, ]` calls below stop working.
exp <- exp[, a, drop = FALSE]
# Subset and reorder the annotation rows to follow the column order of `exp`.
clinical <- clinical[match(a, clinical$id), ]
# Sanity check: prints TRUE when samples line up one-to-one, in order.
identical(clinical$id, colnames(exp))
## [1] TRUE
4. 单基因表达量画图
library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)

# Gene to plot. Other examples: "METTL3", "SETD2", "TP53".
g <- "METTL3"
# Fail with a readable message instead of "subscript out of bounds".
if (!g %in% rownames(exp)) {
  stop("Gene not found in expression matrix: ", g, call. = FALSE)
}

# One row per cell line: expression of `g` plus the `id` and `site`
# annotation columns (columns 1 and 5 of `clinical`).
pdat <- cbind(gene = exp[g, ], clinical[, c(1, 5)])
# Cell lines without a site annotation cannot be assigned to a box.
pdat <- drop_na(pdat, site)

# Order sites by median expression, highest first, so the boxes are sorted.
su <- group_by(pdat, site) %>%
  summarise(a = median(gene)) %>%
  arrange(desc(a))
pdat$site <- factor(pdat$site, levels = su$site)

# Interpolate the 8 "Set1" colours into as many fill colours as needed.
mypalette <- colorRampPalette(brewer.pal(8, "Set1"))
# The original hard-coded 25 colours, which makes scale_fill_manual() fail
# when there are more than 25 sites. Keep 25 as the minimum so existing plots
# are coloured exactly as before, and grow the palette only when required.
n_fill <- max(25, nlevels(pdat$site))

ggplot(pdat, aes(x = site, y = gene, fill = site)) +
  geom_boxplot() +
  theme_bw() +
  theme(
    axis.text.x = element_text(vjust = 1, hjust = 1, angle = 70),
    legend.position = "bottom"
  ) +
  scale_fill_manual(values = mypalette(n_fill)) +
  guides(fill = guide_legend(nrow = 3, byrow = TRUE))