RobustRankAggreg分析胃癌差异基因

目的：下载GSE19826、GSE33335、GSE56807、GSE63089这四个GSE数据集，分别在每个数据集里求出差异表达基因，最后用RobustRankAggreg方法寻找共同差异表达的基因。

四个数据集

一.处理GSE19826

1.下载原始矩阵文件

在GEO里输入GSE19826，下载矩阵文件series matrix file：

series matrix file

将下载的文件命名为GSE19826.txt。复制GSE19826.txt为GSE19826.xls，用excel打开，下滑：

67行以后

可以看到67行以后是探针矩阵信息。复制67行以后的部分，另存为probeMatrix.txt

2.下载平台注释文件

GSE19826的平台为GPL570，点开后点击download full table进行下载：

download full table

将下载文件命名为ann.txt；
复制ann.txt为ann.xls，用excel打开：

第11列

可以看到第11列就是基因名称。

3.运行perl脚本

运行perl脚本probe2symbol.pl，得到基因表达矩阵geneMatrix.txt，记得输入的数字是11(因为平台注释文件第11列是基因名称)：

perl脚本

4.使用limma得到差异表达基因

将基因表达矩阵geneMatrix.txt作为输入，使用limma得到差异表达基因

logFoldChange=2
adjustP=0.05
library(limma)
library("impute")

#读取文件
rt=read.table("input.txt",sep="\t",header=T)
 #这个input.txt文件是 1.download里的基因表达矩阵geneMatrix.txt，
 #也就是GSE33335的基因表达矩阵

#将表格转化为矩阵
rt=as.matrix(rt)
rt[1:3,1:3]
rownames(rt)=rt[,1]#将rt的行名变为基因名
rt[1:3,1:3]
exp=rt[,2:ncol(rt)]
exp[1:3,1:3]
#看exp与rt的区别，学会这种转化方法
dimnames=list(rownames(exp),colnames(exp))
rt=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
rt[1:3,1:3]

#填补缺失值
mat=impute.knn(exp)
rt=mat$data

#求均值
rt=avereps(rt)

#校正
pdf(file="rawBox.pdf")
boxplot(rt,col = "blue",xaxt = "n",outline = F)
dev.off()
rt=normalizeBetweenArrays(as.matrix(rt))
pdf(file="normalBox.pdf")
boxplot(rt,col = "red",xaxt = "n",outline = F)
dev.off()

#自动判断是否需要进行log化
qx <- as.numeric(quantile(rt, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)

if (LogC) { 
  rt[which(rt <= 0)] <- NaN
  rt <- log2(rt)
  print("log2 transform finished")
}else{
  print("log2 transform not needed")
}


#接下来用limma做差异分析：共三个步骤 lmFit()  eBayes()  topTable()

#1.加入分组信息 rt是GSE19826的表达矩阵，共27列
dim(rt)
group_list <- c("control","cancer","control","cancer","control","cancer",
                "control","cancer","control","cancer","control","cancer",
                "control","cancer","control","cancer","control","cancer",
                "control","cancer","control","cancer","control","cancer",
                rep("control",3))

#2.构建实验设计矩阵design
design <- model.matrix(~0+factor(group_list))
colnames(design) <- levels(factor(group_list))#design的列名
rownames(design)=colnames(rt)                 #design的行名
design##注意：这里设计出design以后，一定要和GEO里的网页对应一下，
      ##例如这里design显示GSM495074是cancer组，回到GEO网页对应GSM495074是cancer组

#3.构建对比模型，比较两个实验条件下表达数据
contrast.matrix<-makeContrasts(cancer-control,levels = design)
contrast.matrix##这个矩阵声明，我们要把cancer组跟control组进行差异分析比较

#4.差异分析
##step1 线性模型拟合
fit <- lmFit(rt,design)
##step2 根据对比模型进行差值计算
fit2 <- contrasts.fit(fit, contrast.matrix)
##step3 贝叶斯检验
fit2 <- eBayes(fit2)
##step4 生成基因的检验结果报告
allDiff=topTable(fit2,adjust='fdr',number=200000)
write.table(allDiff,file="limmaTab.xls",sep="\t",quote=F)
##step5 按log(foldchange)排序,为RRA做准备
allLimma=allDiff
allLimma=allLimma[order(allLimma$logFC),]
head(allLimma)
allLimma=rbind(Gene=colnames(allLimma),allLimma)
head(allLimma)
write.table(allLimma,file="limmaTab.txt",sep="\t",quote = F,col.names = F)

#得到的limmaTab.txt可用于RRA分析

二.处理GSE33335

1.下载原始矩阵文件

在GEO里输入GSE33335，下载矩阵文件series matrix file，将下载的文件命名为GSE33335.txt。复制GSE33335.txt为GSE33335.xls，用excel打开，复制74行以后为探针矩阵矩阵probeMatrix.txt。

2.下载平台注释文件

下载GPL5175这个平台的注释文件为ann.txt，复制为ann.xls用excel打开：

GPL5175

可以看到与GSE19826不同的是，这里没有genesymbol这一列，因此我们使用GB_LIST这一列获取基因表达矩阵。

3.运行perl脚本

step1:将探针转换为genebank id
所需内容：注释文件ann.txt、探针矩阵文件probeMatrix.txt、脚本probe2symbol.pl：

perl脚本

得到geneMatrix.txt（此时输入的数值为2，因为GB_LIST在第2列）。geneMatrix.txt的行名为genebankid，列名为样品名：

geneMatrix.txt

step2将genebank id转化为基因名
所需内容：step1得到的geneMatrix.txt、genebankid对应人基因名的文件gene2accession.txt(已整理好，可从NCBI下载)、脚本gb2symbol.pl，得到基因表达矩阵geneMatrix2.txt.

4-5.同一

以上可以看出，从GEO里用GSE编码获取基因表达矩阵有两种方法。第一种是平台注释文件里有genesymbol这一列，第二种是平台注释文件里没有genesymbol，但是有GB_LIST。两种方法最终都可得到基因表达矩阵。

三.处理GSE56807

四.处理GSE63089

五.用RobustRankAggreg进行分析

RRA

六.画图验证


rm(list=ls())
padj=0.05 #过滤的p值
logFC=1

files=c("GSE19826.txt","GSE33335.txt","GSE56807.txt","GSE63089.txt")
upList=list()
downList=list()
allFCList=list()

#1.读入4个GSE数据集
for(i in 1:length(files)){
  inputFile=files[i]
  rt=read.table(inputFile,header = T)
  header=unlist(strsplit(inputFile,"\\."))
  downList[[header[1]]]=as.vector((rt[,1])) #第一列
  upList[[header[1]]]=rev(as.vector(rt[,1]))#第一列逆序
  fcCol=rt[,1:2]
  colnames(fcCol)=c("Gene",header[[1]])
  allFCList[[header[1]]]=fcCol
}

#2.合并所有的FC值
mergeLe=function(x,y){
  merge(x,y,by="Gene",all=T)}
newTab=Reduce(mergeLe,allFCList)
rownames(newTab)=newTab[,1]
newTab=newTab[,2:ncol(newTab)]
newTab[is.na(newTab)]=0

#3.获取上调基因
library(RobustRankAggreg)
upMatrix = rankMatrix(upList)
upAR = aggregateRanks(rmat = upMatrix)
colnames(upAR)=c("Name","Pvalue")
upAdj=p.adjust(upAR$Pvalue,method = "bonferroni")#校正p值
upXls=cbind(upAR,adjPvalue=upAdj)
upFC=newTab[as.vector(upXls[,1]),]
upXls=cbind(upXls,logFC=rowMeans(upFC))
#write.table(upXls,file="up.xls",sep="\t",quote=F,row.names = F)
upSig=upXls[(upXls$adjPvalue<padj & upXls$logFC>logFC),]
#write.table(upSig,file="upSig.xls",sep="\t",quote=F,row.names = F)#显著上调基因

#3.获取下调基因
downMatrix = rankMatrix(downList)
downAR = aggregateRanks(rmat=downMatrix)
colnames(downAR)=c("Name","Pvalue")
downAdj=p.adjust(downAR$Pvalue,method = "bonferroni")
downXls=cbind(downAR,adjPvalue=downAdj)
downFC=newTab[as.vector(downXls[,1]),]
downXls=cbind(downXls,logFC=rowMeans(downFC))
#write.table(downXls,file="down.xls",sep = "\t",quote=F,row.names = F)
downSig=downXls[(downXls$adjPvalue<padj & downXls$logFC< -logFC),]
#write.table(downSig,file="downSig.xls",sep = "\t",quote = F,row.names=F)#显著下调基因

#得到上下调基因后可做GO、KEGG分析和蛋白互作网络

#5.画热图
hminput=newTab[c(as.vector(upSig[1:20,1]),as.vector(downSig[1:20,1])),]#各取上下调20个基因画图
library(pheatmap)
tiff(file="logFC.tiff",width = 15,height = 20,units ="cm",compression="lzw",bg="white",res=400)
pheatmap(hminput,display_numbers = TRUE,fontsize_row=10,fontsize_col=12,
         color = colorRampPalette(c("green", "white", "red"))(50),
         cluster_cols = FALSE,cluster_rows = FALSE, )
dev.off()

############################################################################
#6.以上用RRA法求出两个不同数据集的差异基因，接下来进行作图验证

#加载R包
library(limma) 
library(ggplot2)
library(ggpubr)

#加载GSE63089的表达矩阵rt
load(file="GSE63089exp.Rdata")

#构建分组信息
group <- c(rep("cancer",45),rep("control",45))
group <- factor(group,levels = c("cancer","control"),ordered = F)

#对表达矩阵和分组信息进行合并
dd <- data.frame(group=group,t(rt))
dim(dd)
dd[1:90,1:5]#和GEO原网页对应

#构建比较对象
my_comparisons <- list(
  c("cancer", "control")
)

#画图
ggboxplot(
  dd, x = "group", y = "CST1", #挑选胃癌里高表达基因CST1
  color = "group", palette = c("#00AFBB", "#E7B800"),
  add = "jitter"
) + stat_compare_means(comparisons = my_comparisons, method = "t.test")
##如果这里想让control组在左边，只需在group <- factor处把control放在前面即可，
##这就是factor()的用处

#再画一个胃癌里低表达基因
ggboxplot(
  dd, x = "group", y = "VSIG2", #挑选胃癌里低表达基因VSIG2
  color = "group", palette = "jama",
  add = "jitter"
) + stat_compare_means(comparisons = my_comparisons, method = "t.test")

VSIG2

Rplot02.png

通过六，学会保存Rdata和加载Rdata的方法

七.差异基因后续分析

参考GEO RRA\GSE33335\03.limma里的limma_tdy.R