一、GEO数据下载

https://www.ncbi.nlm.nih.gov/geo/

限定条件:

Entry type：数据导入类型 , 最常用的是series
Top organisms : Homo sapiens
Study type : 检测类型 , 不同的实验检测类型不一样

检索结果

Status : 使用已经Public的
Platforms : 测序使用的平台
Samples : 每个样本信息
Bioproject :可获得原始fastq文件
SOFT formatted family file : 样本信息
Series Matrix File : 表达矩阵

二、差异分析

一 ) 加载R包获取数据

###1. 加载R包获取数据
library (GEOquery)
gset <- getGEO ("GSE137842" , destdir="." , getGPL = F)  
#gset是一个list     GEO号      下载的位置     是否下载GPL注释文件

###2. 获取表达矩阵
exp <- gset[["GSE137842_series_matrix.txt.gz"]]@assayData$exprs
                                             #assayData储存的是表达数据
###3. 获取分组信息
group <-gset[["GSE137842_series_matrix.txt.gz"]]@phenoData@data
                                                   #表型文件

##此时的exp是行为探针名,列为样本名的matrix,将其转为数据框,利于后续操作
exp <- as.data.frame(exp)
colnames(group)  ##获得group的列名
group1 <-group[,c(8,2)] ##只需要"geo_accession"  "source_name_ch1"两列,分别为第8和2列

二 ) 探针的转换

GPL570 <- getGEO("GPL570",destdir = ".") ##获取注释信息,相当于之前getGPL = TRUE
GPL <- GPL570@dataTable@table  
GPL <- GPL[ , c(1,11)]  ##类似的,只需要"ID" "Gene Symbol"两列,分别为第1和11列
colnames(GPL) <- c("ID","gene")  ##修改将带有空格的列名

现在有了三个文件,分别为表达矩阵exp,注释文件GPL,分组文件group1

两种情况

一个探针对应多个基因 :去除 , 删除带///的行
gpl <- GPL[-grep("///",GPL$gene) , ]
多个探针对应一个基因:去重
去重有多种方式,可随机去重,也根据以下方式取表达量最大的值

exp_1 <- as.data.frame(exp)
library(tibble)
exp_1 <- rownames_to_column (exp_1,var="ID") ##将exp_1的行名转为第一列,列名为ID
exp_1 <-merge(exp_1,gpl,by="ID") ##将两个表exp_1和gbl按共同的一列ID列合并
exp_1 <- select(exp_1 , -"ID")  ##删除"ID"列
exp_1 <- select(exp_1,gene,everything())  ##重新排列,使"gene"在第一列
exp_1 <- mutate(exp_1, rowmean = rowMeans(exp_1[grep("GSM",names(exp_1))]))
##取带有"GSM"的列,计算每行均值,并将其增加为一列,列名rowmean
##(由于除第一列gene以外均为带GSM的列,所以也等价为 
            exp_1 <- mutate(exp_1, rowmean = rowMeans(exp_1[-1]))
exp_1 <- arrange(exp_1,desc(rowmean))  ##按rowmean从大到小排列
exp_1 <- distinct(exp_1,gene,.keep_all = T)  ##删除重复的gene
exp_1 <- select(exp_1 , -rowmean)  ##去除rowname一列
exp_1 <- column_to_rownames(exp_1,colnames(exp_1)[1])  ##将第一列变成行名
exp_1<-exp_1[-1 , ]  ##删除第一行

由于输入参数就是它前一行的输出结果，因此可以使用%>%进行改写:

library(dplyr)
exp_1<- as.data.frame(exp) %>%
  rownames_to_column(var = "ID") %>%
  merge(gpl , by = "ID") %>%
  select(-ID) %>%  
  select(gene , everything()) %>%
  mutate(rowmean = rowMeans(.[grep("GSM",names(.))])) %>%
  arrange(desc(rowmean)) %>%
  distinct(gene , .keep_all = T) %>%
  select(-rowmean) %>%
  tibble::column_to_rownames(colnames(.)[1])
exp_1<-exp_1[-1 , ]  ##删除第一行

三 ) Normalization

boxplot(log10(exp_1) + 1)   ### +1防止出现0值输出NA

此时数据没有归一化

使用limma包进行处理

library(limma)
exp_1 <- log2(exp_1 + 1)
exp_1 <- normalizeBetweenArrays(exp_1)
boxplot(exp_2)

处理完毕

四 ) limma包差异分析

library(limma)
design <- model.matrix(~0 + factor(c(1,1,1,2,2,2,3,3,3))) 
colnames(design) <- c("group1" , "group2" , "group3")
exp_2 <- exp_2[ , -10]   ##删去无用的第十列 Naïve bone样本
fit <- lmFit(exp_2 , design)
contrast.matrix <- makeContrasts(group2-group1 , group3-group2 , group3-group1,levels = design)
fit2 <- contrasts.fit(fit ,contrast.matrix)
fit2 <- eBayes(fit2)
diff <- topTable(fit2 , coef = 1 ,adjust = "BH" , p.value = 1 , number = Inf)
##topTable参数: 
     ##coef = 1: 根据上面makeContrasts的第一个（group2-group1）来提取结果
     ##adjust: 对p值的校正方法，包括："none", "BH", "BY" and "holm"

以GSE32575为例

1.加载R包获取数据

library(GEOquery)
gset <- getGEO("GSE32575", destdir=".",getGPL = F)
exp <- gset[[1]]@assayData$exprs  ##获取表达矩阵
exp <- as.data.frame(exp)  ###转为数据框
exp <- exp[ , -c(1:12)] ##删除不需要的前12列
group <- gset[[1]]@phenoData@data
group <- group[,c(8,2)]

也可用pData函数和exprs函数获取分组信息和表达矩阵

pdata <- pData(gset)
expr <- exprs(gset)

2. Normalization

exp <- log2(exp + 1)  #数据较大,需要对数处理
exp <- normalizeBetweenArrays(exp)
boxplot(exp)  ##图显示数据已经标准化

3. 探针转换与基因去重

gpl <- getGEO("GPL6102",destdir = "." ,)
gpl <- gpl@dataTable@table
gpl <- gpl[ , c(1,13)]   ##只需要"ID" "Gene Symbol"两列,分别为第1和13列

此时表Symbol列有许多空白行,去除

gpl1 <- gpl[-which(gpl$Symbol == "")  , ]

选取平均表达量最高的探针,并去除重复基因

library(dplyr)
exp_1<- as.data.frame(exp) %>%
  rownames_to_column(var = "ID") %>%
  merge(gpl , by = "ID") %>%
  select(-ID) %>%  
  select(Symbol , everything()) %>%
  mutate(rowmean = rowMeans(.[grep("GSM",names(.))])) %>%
  arrange(desc(rowmean)) %>%
  distinct(Symbol  , .keep_all = T) %>%
  select(-rowmean) %>%
  tibble::column_to_rownames(colnames(.)[1])

此时数据变成这个样子

4. limma包差异分析

design <- model.matrix(~ factor(c(rep(1,18),rep(2,18))))
colnames(design) <- c("group1" , "group2")
fit <- lmFit(exp_1 , design)
fit <- eBayes(fit)
diff <- topTable(fit , coef = "group2" ,adjust = "fdr" , p.value = 0.05 , number = Inf)

得到的差异分析结果7682个

5. 作图

group_list <- c(rep('before',18),rep('after',18))
group_list <- factor(group_list)
group_list <- relevel(group_list, ref="before")
data_plot <- as.data.frame(t(exp_1))###需要用t函数,行列转置,使基因名称在列
data_plot <- data.frame(pairinfo=rep(1:18,2),
                       group=group_list,
                       data_plot,stringsAsFactors = F)

挑选了一个基因CAMKK2

library(ggplot2)
ggplot(data_plot, aes(group,CAMKK2,fill=group)) +
  geom_jitter(aes(colour=group), size=2, alpha=0.5)+
  xlab("") +
  ylab(paste("Expression of ","CAMKK2"))+
  theme_classic()+
  theme(legend.position = "none")

3.png

加入配对信息作图

library(ggplot2)
ggplot(data_plot, aes(group,CAMKK2,fill=group)) +
  geom_boxplot() +
  geom_point(size=2, alpha=0.5) +
  geom_line(aes(group=pairinfo), colour="black", linetype="11") +
  xlab("") +
  ylab(paste("Expression of ","CAMKK2"))+
  theme_classic()+
  theme(legend.position = "none")

加入配对信息

对比差异较大的的基因

library(ggplot2)
ggplot(data_plot, aes(group,CH25H,fill=group)) +
  geom_boxplot() +
  geom_point(size=2, alpha=0.5) +
  geom_line(aes(group=pairinfo), colour="black", linetype="11") +
  xlab("") +
  ylab(paste("Expression of ","CH25H"))+
  theme_classic()+
  theme(legend.position = "none")

对比差异较大的的基因如CH25H

批量画出差异最大的8个基因

library(dplyr)
library(tibble)
allDiff_arrange <- diff %>% 
  rownames_to_column(var="genesymbol") %>% 
  arrange(desc(abs(logFC)))
genes <- allDiff_arrange$genesymbol[1:8]

plotlist <- lapply(genes, function(x){
  data <- data.frame(data_plot[,c("pairinfo","group")],gene=data_plot[,x])
  ggplot(data, aes(group,gene,fill=group)) +
    geom_boxplot() +
    geom_point(size=2, alpha=0.5) +
    geom_line(aes(group=pairinfo), colour="black", linetype="11") +
    xlab("") +
    ylab(paste("Expression of ",x))+
    theme_classic()+
    theme(legend.position = "none")
})

library(cowplot)
plot_grid(plotlist=plotlist, ncol=4,labels = LETTERS[1:8])

差异最大的前8个基因

画热图

library(pheatmap)
## 设定差异基因阈值，减少差异基因用于提取表达矩阵
allDiff_pair=topTable(fit,adjust='BH',coef="group2",number=Inf,p.value=0.05,lfc =0.5) 
##提前部分数据用作热图绘制
heatdata <- exp[rownames(allDiff_pair),]
##制作一个分组信息用于注释
annotation_col <- data.frame(group_list)
rownames(annotation_col) <- colnames(heatdata)

#如果注释出界，可以通过调整格子比例和字体修正
pheatmap(heatdata, #热图的数据
         cluster_rows = TRUE,#行聚类
         cluster_cols = TRUE,#列聚类，可以看出样本之间的区分度
         annotation_col =annotation_col, #标注样本分类
         annotation_legend=TRUE, # 显示注释
         show_rownames = F,# 显示行名
         show_colnames = F,# 显示列名
         scale = "row", #以行来标准化，这个功能很不错
         color =colorRampPalette(c("blue", "white","red"))(100))

热图

画火山图

library(ggplot2)
library(ggrepel)
library(dplyr)
data <- topTable(fit,adjust='BH',coef="group2",number=Inf) 
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 0.5)
data$gene <- rownames(data)
ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
  geom_point(alpha=0.8, size=1.2,col="black")+
  geom_point(data=subset(data, logFC > 0.5),alpha=0.8, size=1.2,col="red")+
  geom_point(data=subset(data, logFC < -0.5),alpha=0.6, size=1.2,col="blue")+
  labs(x="log2 (fold change)",y="-log10 (adj.P.Val)")+
  theme(plot.title = element_text(hjust = 0.4))+
  geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
  geom_vline(xintercept = c(0.5,-0.5),lty=4,lwd=0.6,alpha=0.8)+
  theme_bw()+
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),   
        axis.line = element_line(colour = "black")) +
  geom_point(data=subset(data, abs(logFC) > 1),alpha=0.8, size=3,col="green")+
  geom_text_repel(data=subset(data, abs(logFC) > 1), 
                  aes(label=gene),col="black",alpha = 0.8)

火山图

GEO芯片数据分析与画图