保姆式GEO数据挖掘演示
配对GEO数据分析
GEO数据挖掘系列文
TCGAbiolinks做差异分析
我的一个paired DEGs 的例子
rm(list=ls())
suppressMessages(library(limma))
suppressMessages(library("plyr"))
data_dir<-"your_path"
setwd(data_dir)
#####download data
if(T){
suppressMessages(library(GEOquery))
gset <- getGEO('GSE76161', destdir=".",
AnnotGPL = T,
getGPL = T)
save(gset,file=paste0(human_dir,'/GSE76161_eSet.Rdata'))
}
load(paste0(human_dir,'/GSE76161_eSet.Rdata'))
###Get gene expression and phenotype data
b = gset[[1]]
exprSet=exprs(b)
###get GPL information and clean the data
GPL = b@featureData@data
colnames(GPL)
ids = GPL[,c(1,3)]
ids = ids[ids[,2] != '',]
a<-strsplit(as.character(ids[,2]), "///")
tmp <- mapply(cbind, ids[,1],a)
df <- ldply (tmp, data.frame)
ID2gene = df
colnames(ID2gene) = c("id", "gene")
exprSet = exprSet[rownames(exprSet) %in% ID2gene[,1],]
ID2gene = ID2gene[match(rownames(exprSet), ID2gene[,1]),]
dim(ID2gene)
dim(exprSet)
tail(sort(table(ID2gene[,2])), n = 12L)
{
MAX = by(exprSet, ID2gene[,2],
function(x) rownames(x)[ which.max(rowMeans(x))])
MAX = as.character(MAX)
exprSet = exprSet[rownames(exprSet) %in% MAX,]
rownames( exprSet ) = ID2gene[ match( rownames( exprSet ), ID2gene[ , 1 ] ), 2 ]
}
exprSet[1:5,1:5]
pdata = pData(b)
colnames(pdata)
save(exprSet, pdata, file = paste0(human_dir,'/finalSet.Rdata'))
###DEG analysis
#build the matrix
SibShip <- factor(pdata$characteristics_ch1.3,levels=c("patient_id: 3","patient_id: 4","patient_id: 5"))
Treat<-factor(pdata[,47],levels=c("vehicle (0.1% DMSO)","1 uM OCA (INT-747)"))
design <- model.matrix(~0 + (SibShip+Treat))
rownames(design) = colnames(exprSet)
fit1 <- lmFit(exprSet, design)
#fit2 <- contrasts.fit(fit1, design)
fit2 <- eBayes(fit1)
nrDEG = topTable( fit2,
adjust.method="BH",
coef = "Treat1 uM OCA (INT-747)",
n = nrow(fit2)
)
nrDEG<-nrDEG[order(nrDEG$P.Value),]
write.csv(nrDEG,paste0(data_dir,"/GSE76161_human_DEGs_INT-747_vs_DMSO.csv"))