参考学习GEO提供的GEO2R在线差异分析工具使用的代码流程。笔记里的代码在理解每一步的基础上,部分做了修改。但大致思路不变,即使用limma包对芯片测序数据进行差异分析(暂未涉及可视化的代码)。
示例数据
- GSE:GSE43760
- Selected group design:
(1)Disease:"GSM1070439","GSM1070441", "GSM1070443", "GSM1070445", "GSM1070447", "GSM1070449"
(2)Normal:"GSM1070451", "GSM1070453", "GSM1070455", "GSM1070457", "GSM1070459", "GSM1070461"
分析流程Pipeline
Step 1 :下载表达矩阵等
library(GEOquery)
GSE=paste0("GSE","43760")
gset=getGEO(GSE, getGPL = F)
class(gset)
#"list"
length(gset)
#1
- 如上有两个注意点
(1)设置不下载GPL注释信息,因为此前以专门整理了该GPL平台的基因注释信息,节省时间。详见之前得到笔记https://www.jianshu.com/p/d89a25d43549;
(2)初步下载的GSE对象格式之所以是list,是因为同一个GSE的samples可能分别在不同平台(gpl)测序结果,对应list里的不同元素组成。一般来说都是一个GSE对应一个GPL,但遇到多平台情况,需要注意自己选择的样本测序结果是什么平台,再进行选择。
gset_sub=gset[[1]]
gset_sub
# ExpressionSet (storageMode: lockedEnvironment)
# assayData: 32968 features, 24 samples
# element names: exprs
# protocolData: none
# phenoData
# sampleNames: GSM1070439 GSM1070440 ... GSM1070462 (24 total)
# varLabels: title geo_accession ... treatment:ch1 (41 total)
# varMetadata: labelDescription
# featureData: none
# experimentData: use 'experimentData(object)'
# pubMedIds: 23771909
# Annotation: GPL6244
# Get GPL ID
gpl=unique(gset_sub@phenoData@data[,"platform_id"])
[1] "GPL6244"
#Get expression matrix
expr=gset_sub@assayData$exprs
#[1] 32968 24
#Get sample meta(optional)
meta=gset_sub@phenoData@data[,grep(":ch1",colnames(gset_sub@phenoData@data))]
#[1] "disease state:ch1" "gender:ch1" "individual:ch1"
#[4] "time:ch1" "tissue:ch1" "treatment:ch1"
Step 2:表达矩阵与分组设计
- 首先需要核对下,表达数据是否已经经过log转换;如果没有则log转换下。GEO2R通过下述代码实现自动检测。
qx <- as.numeric(quantile(expr, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0)
if (LogC) { expr[which(expr <= 0)] <- NaN
expr <- log2(expr) }
- 然后根据选择的分组取表达矩阵的子集
GSM_case=c("GSM1070439","GSM1070441", "GSM1070443", "GSM1070445", "GSM1070447", "GSM1070449")
GSM_normal=c("GSM1070451", "GSM1070453", "GSM1070455", "GSM1070457", "GSM1070459", "GSM1070461")
index_case=which(colnames(expr) %in% GSM_case)
index_control=which(colnames(expr) %in% GSM_normal)
expr_sle=expr[,c(index_case,index_control)]
- 分组设计
groups=factor(c(rep(1,length(index_case)),rep(0,length(index_control))),
levels=c(0,1),labels = c("normal","disease"))
Step 3:limma包差异分析
library(limma)
#首先设置分组矩阵,指明所有sample的分组情况
design=model.matrix(~0+groups)
colnames(design) <- levels(groups)
# disease normal
# 1 0 1
# 2 0 1
# 3 0 1
# 4 0 1
# 5 0 1
# 6 0 1
# 7 1 0
# 8 1 0
# 9 1 0
# 10 1 0
# 11 1 0
# 12 1 0
#然后根据分组进行线性拟合
fit <- lmFit(expr_sle, design)
#需要进一步指明差异分析的比较为 disease组 VS normal组
cts <- paste("disease", "normal", sep="-") #注意顺序:前者比上后者
cont.matrix <- makeContrasts(contrasts=cts, levels=design) #比较矩阵,指明哪两组进行比较
fit2 <- contrasts.fit(fit, cont.matrix)
#最后得到差异分析结果
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="p",number=Inf)
head(tT)
# logFC AveExpr t P.Value adj.P.Val B
# 8107769 -0.8016667 9.627500 -6.764851 1.062535e-05 0.1905344 2.556737
# 7892531 -0.6966667 7.201667 -6.207964 2.611828e-05 0.1905344 1.959251
# 7894928 -0.7716667 6.700833 -6.123500 3.004739e-05 0.1905344 1.863787
# 7893992 -0.9983333 7.504167 -6.065703 3.309032e-05 0.1905344 1.797715
# 8012349 -1.3166667 8.798333 -5.915814 4.258809e-05 0.1905344 1.623514
# 8059996 -0.9233333 7.895000 -5.890448 4.445985e-05 0.1905344 1.593624
deg <- data.frame(Probe=rownames(tT),P.Value=tT[,"P.Value"],
logFC=tT[,"logFC"],FDR=tT[,"adj.P.Val"])
Step 4:结合GPL平台信息注释symbo基因名
gpl_anno=read.csv(paste0("/your path/to/GPL_array/",gpl,".csv"))
gpl_anno[,1]=as.character(gpl_anno[,1])
deg=dplyr::left_join(deg,gpl_anno[,-3],by=c("Probe"="ID"))
deg=na.omit(deg)
rownames(deg)=1:nrow(deg)
# Probe logFC P.Value FDR Symbol
# 1 8107769 -0.8016667 -0.8016667 0.1905344 SLC12A2
# 2 8012349 -1.3166667 -1.3166667 0.1905344 PER1
# 3 8059996 -0.9233333 -0.9233333 0.1905344 PER2
# 4 7899018 -0.4683333 -0.4683333 0.1905344 TMEM57
# 5 7937696 0.7750000 0.7750000 0.2157829 KRTAP5-AS1
# 6 7897449 -0.4866667 -0.4866667 0.2164292 SPSB1
- 以上是针对GEO2R里结合limma对芯片数据分析的方法;
- 如果是RNAseq的counts数据,则需要结合
edgeR
包的voom()
函数,总体思路不变(不需要考虑GPL的基因名注释问题了),相关代码如下。 -
示例数据:GSE75852
#加载包
library(limma)
library(edgeR)
library(GEOquery)
library(data.table)
#下载数据
count_link="https://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75852/suppl/GSE75852_gene_count_frags.txt.gz"
# https://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112348/suppl/
counts=as.data.frame(fread(count_link))
colnames(counts)
counts[1:4,1:4]
#group分组
gset=getGEO("GSE75852",getGPL = F)
gsms=gset[[1]]@phenoData@data[,grep(":ch1",colnames(gset[[1]]@phenoData@data))]
index_case=which(gsms[,1]=="induced pluripotent stem cell" &
gsms[,2]=="Ataxia Telangiectasia")
index_control=which(gsms[,1]=="induced pluripotent stem cell" &
gsms[,2]=="Normal")
counts_sub=data.frame(counts[,c(index_case+1,index_control+1)],row.names=counts[,1])
groups=factor(c(rep(1,length(index_case)),rep(0,length(index_control))),
levels=c(0,1),labels = c("normal","disease"))
design=model.matrix(~0+groups)
colnames(design) <- levels(groups)
#limma+voom差异分析
v <- voom(counts_sub,design,normalize="quantile")
fit <- lmFit(v,design)
cts <- paste("disease", "normal", sep="-")
cont.matrix <- makeContrasts(contrasts=cts, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="p",number=Inf)
deg <- data.frame(Probe=rownames(tT),P.Value=tT[,"P.Value"],
logFC=tT[,"logFC"],FDR=tT[,"adj.P.Val"],AveExpr=tT[,"AveExpr"])
head(deg)