首先去到https://mp.weixin.qq.com/s/ZcOPNzcj1EZhrHPfUZfwyQ
找到数据集的相关信息:GSE101668
样本信息为:
GSM2711785 RKO-WT-rep1
GSM2711786 RKO-WT-rep2
GSM2711787 RKO-PRDM1-KO2-rep1
GSM2711788 RKO-PRDM1-KO2-rep2
GSM2711789 RKO-PRDM1-KO5-rep1
GSM2711790 RKO-PRDM1-KO5-rep2
GSM2711791 RKO-GFP-OE-rep1
GSM2711792 RKO-GFP-OE-rep2
GSM2711793 RKO-PRDM1α-OE-rep1
GSM2711794 RKO-PRDM1α-OE-rep2
GSM2711795 RKO-PRDM1β-OE-rep1
GSM2711796 RKO-PRDM1β-OE-rep2
这样本不多,分组倒是挺多的。要完成作业需要表达矩阵信息,但是我用getGEO,geoChina都下载不来原始表达矩阵信息,那就只能下载GSE101668_RAW.tar自己拼接了。
先打开开了一下这个压缩包里面有2种信息,每个样本有2个压缩包,一个是.genes.fpkm,另一个是.counts.genes我就直接选择后面的数据合并。
把曾老师的代码修修改改就得到了表达矩阵和临床信息
#数据下载
rm(list = ls())
options(stringsAsFactors = F)
list.files('countsFiles/')
a=read.table('countsFiles/GSM2711785_WT1.counts.genes.txt.gz',
header = T,sep = '\t',row.names = 1)
head(a)
a=do.call(cbind,lapply(list.files('countsFiles/'), function(x){
read.table(file.path('countsFiles',x),
header = T,sep = '\t',row.names = 1)[,1]
}))
a[1:4,1:4]
rownames(a)=rownames(read.table('countsFiles/GSM2711785_WT1.counts.genes.txt.gz',
header = T,sep = '\t',row.names = 1))
colnames(a)=gsub('.counts.genes.txt.gz','',list.files('countsFiles/'))
library(stringr)
a1=read.table("sample")
rownames(a1)=a1[,1]
sample_name=a1[,-1]
names(sample_name)=rownames(a1)
group=as.data.frame(sample_name)
k1=str_detect(group$sample_name,"RKO-WT")
k2=str_detect(group$sample_name,"RKO-PRDM1-KO2")
k3=str_detect(group$sample_name,"RKO-PRDM1-KO5")
k4=str_detect(group$sample_name,"RKO-GFP-OE")
k5=str_detect(group$sample_name,"RKO-PRDM1α-OE")
group$subtype=ifelse(k1,"RKO-WT",ifelse(k2,"RKO-PRDM1-KO2",
ifelse(k3,"RKO-PRDM1-KO5",
ifelse(k4,"RKO-GFP-OE",
ifelse(k5,"RKO-PRDM1α-OE","RKO-PRDM1β-OE")))))
colnames(a)=rownames(group)
a[1:4,1:4]
boxplot(a)
exprSet=a
boxplot(log2(edgeR::cpm(exprSet)+1),las=2)
print(dim(exprSet))
exprSet=exprSet[apply(exprSet,1,
function(x) sum(x>1) > floor(ncol(exprSet)/10)),]
print(dim(exprSet))
library(stringr)
colnames(exprSet)
subtype=group$subtype
table(subtype)
k1=str_detect(group$subtype,"WT")
k2=str_detect(group$subtype,"-OE")
group$group_list=ifelse(k1,"WT",ifelse(k2,"OE","KO"))
group_list=group$group_list
table(group_list)
colD=group[,-1]
colnames(colD)=c("group_list","subtype")
table(colD)
save(exprSet,colD,file = 'input.Rdata')
#(1)提取表达矩阵exp
exp <- exprSet
exp[1:4,1:4]
exp = log2(exp+1)
exp[1:4,1:4]
#(2)提取临床信息
pd <- colD
#(3)调整pd的行名顺序与exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
这里面有2个坑,一个是分组获取"WT","OE","KO"过程中用ifelse函数的时候先进行WT和"-OE"的判断,不然容易出错。另一个是一开始我弄错了分类,后面又重新赋值了,懒得改,反正记录下来,以后就知道了。
得到验证过的表达矩阵信息就好办了下面就是绘图了。
同样是套用代码
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = 'input.Rdata')
# 每次都要检测数据
dat=log2(edgeR::cpm(exprSet)+1)
dat[1:4,1:4]
group_list=colD$subtype
table(group_list)
cg=names(tail(sort(apply(dat,1,sd)),1000))#apply按行('1'是按行取,'2'是按列取)取每一行的方差,从小到大排序,取最大的1000个
library(pheatmap)
pheatmap(dat[cg,],show_colnames =F,show_rownames = F) #对那些提取出来的1000个基因所在的每一行取出,组合起来为一个新的表达矩阵
n=t(scale(t(dat[cg,]))) # 'scale'可以对log-ratio数值进行归一化
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
ac=data.frame(subtype=group_list)
rownames(ac)=colnames(n) #把ac的行名给到n的列名
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac,filename = 'heatmap_top1000_sd.png')
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load(file = 'input.Rdata')
colnames(exprSet)
pheatmap::pheatmap(cor(exprSet))
# 组内的样本的相似性应该是要高于组间的!
pheatmap::pheatmap(cor(exprSet),
annotation_col = colD,
show_rownames = F,
filename = 'cor_all.png')
dim(exprSet)
exprSet=exprSet[apply(exprSet,1, function(x) sum(x>1) > 5),]
dim(exprSet)
exprSet=log(edgeR::cpm(exprSet)+1)
dim(exprSet)
exprSet=exprSet[names(sort(apply(exprSet, 1,mad),decreasing = T)[1:500]),]
dim(exprSet)
M=cor(log2(exprSet+1))
pheatmap::pheatmap(M,annotation_col = colD)
pheatmap::pheatmap(M,
show_rownames = F,
annotation_col = colD,
filename = 'cor_top500.png')
这就得到了heatmap_top1000_sd和cor_top500的图如下:
还可以套用今天的娟老师的配色来一波
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
library(pacman)
p_load(pheatmap)
library(getopt)
library(pheatmap)
library(RColorBrewer)
load(file = 'input.Rdata')
# 每次都要检测数据
dat=log2(edgeR::cpm(exprSet)+1)
dat[1:4,1:4]
group_list=colD$subtype
table(group_list)
cg=names(tail(sort(apply(dat,1,sd)),1000))#apply按行('1'是按行取,'2'是按列取)取每一行的方差,从小到大排序,取最大的1000个
library(pheatmap)
pheatmap(dat[cg,],show_colnames =F,show_rownames = F) #对那些提取出来的1000个基因所在的每一行取出,组合起来为一个新的表达矩阵
n=t(scale(t(dat[cg,]))) # 'scale'可以对log-ratio数值进行归一化
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
ac=data.frame(subtype=group_list)
rownames(ac)=colnames(n) #把ac的行名给到n的列名
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac,filename = 'heatmap_top1000_sd.png')
##------color----
#默认色系:红黄蓝
color.1 <- colorRampPalette(rev(brewer.pal(n = 7, name ="RdYlBu")))(100)
#红黑绿
color.2 <- colorRampPalette(rev(c("#ff0000", "#000000", "#00ff00")))(1000)
#红白蓝
color.3 <- colorRampPalette(rev(c("#ff0000", "#ffffff", "#0000ff")))(100)
color.set <- list(color.1, color.2,color.3)
pheatmap(n,scale = "row",color = color.1,show_colnames =F,show_rownames = F,
annotation_col=ac,filename = 'heatmap_top1000_sd1.png')
pheatmap(n,scale = "row",color = color.2,show_colnames =F,show_rownames = F,
annotation_col=ac,filename = 'heatmap_top1000_sd2.png')
pheatmap(n,scale = "row",color = color.3,show_colnames =F,show_rownames = F,
annotation_col=ac,filename = 'heatmap_top1000_sd3.png')
可以得到另外2种配色的图:
这个红白蓝的就很讨人喜欢。以后如有需要就直接套用了。