富集分析:GO、KEGG
id转换:bitr()
输入数据:差异基因的entrezid;
所有基因的entrezid
KEGG
GO
处理GEO数据
代码分析流程
#数据下载
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse_number = "GSE56649"
eSet <- getGEO(gse_number, #本地有文件的话先加载本地文件,本地没有再从网页下载
destdir = '.', #从当前目录读取
getGPL = F) #不下载GPL注释
class(eSet)
length(eSet)
eSet = eSet[[1]]
#(1)提取表达矩阵exp
exp <- exprs(eSet) #exprs()用来提取表达矩阵
exp[1:4,1:4]
exp = log2(exp+1) #若表达矩阵已经取log,则不需要这行代码
boxplot(exp)
#(2)提取临床信息
pd <- pData(eSet) #用pData()提取临床信息
#(3)调整pd的行名顺序与exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))] #调整exp的列名
#(4)提取芯片平台编号
gpl_number <- eSet@annotation
save(gse_number,pd,exp,gpl_number,file = "step1output.Rdata")
*探针注释:
(1)多个探针对应一个基因:按照基因去重复
(2)一个探针对应多个基因:非特异性探针
# Group(实验分组)和ids(探针注释)
rm(list = ls())
load(file = "step1output.Rdata")
library(stringr)
# 1.Group----
# 第一类,有现成的可以用来分组的列
if(F) Group = pd$`disease state:ch1`
#第二类,自己生成
if(F){
Group=c(rep("RA",times=13),
rep("control",times=9))
}
rep(c("RA","control"),times = c(13,9))
#第三类,匹配关键词,自行分类
Group=ifelse(str_detect(pd$source_name_ch1,"control"),"control","RA")
#设置参考水平,指定levels,对照组在前,处理组在后,一定到设定
Group = factor(Group,
levels = c("control","RA"))
Group
# 注意levels与因子内容必须对应一致
# Group = pd$`disease state:ch1`
# Group = factor(Group,
# levels = c("healthy control","rheumatoid arthritis"))
#2.ids----------找到探针名和基因名对应的数据框
#方法1 BioconductorR包(最常用)
gpl_number
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL) # toTable()是变成数据框,转变为探针 ID 的注释矩阵
head(ids)
# 方法2 读取GPL平台的soft文件,按列取子集
##https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL570
if(F){
#注:soft文件列名不统一,活学活用,有的GPL平台没有提供注释,如GPL16956
a = getGEO(gpl_number,destdir = ".")
b = a@dataTable@table
colnames(b)
ids2 = b[,c("ID","Gene Symbol")]
colnames(ids2) = c("probe_id","symbol")
ids2 = ids2[ids2$symbol!="" & !str_detect(ids2$symbol,"///"),] #将一个探针对应多个基因的探针以及探针对应空字符串的探针删掉
}
# 方法3 官网下载,文件读取
##http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# 方法4 自主注释
#https://mp.weixin.qq.com/s/mrtjpN8yDKUdCSvSUuUwcA
save(exp,Group,ids,gse_number,file = "step2output.Rdata")
数据探索—PCA和热图
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
#输入数据:exp和Group
#Principal Component Analysis
#http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials
# 1.PCA 图----
dat=as.data.frame(t(exp))
library(FactoMineR)
library(factoextra)
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = Group, # color by groups
palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # 若为FALSE时,椭圆消失
legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse_number,"_PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")
# 2.top 1000 sd 热图----
cg=names(tail(sort(apply(exp,1,sd)),1000))
n=exp[cg,]
# 直接画热图,对比不鲜明
library(pheatmap)
annotation_col=data.frame(group=Group)
rownames(annotation_col)=colnames(n)
pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col
)
# 用标准化的数据画热图,两种方法的比较:https://mp.weixin.qq.com/s/jW59ujbmsKcZ2_CM5qRuAg
## 1.使用热图参数
pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col,
scale = "row", #只比较列与列的差别
breaks = seq(-3,3,length.out = 100) #避免离群值的颜色对结果出现偏差
) #breaks 参数解读在上面链接
dev.off()
## 2.自行标准化再画热图
n2 = t(scale(t(n))) #scale()只能按列标准化
pheatmap(n2,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col,
breaks = seq(-3,3,length.out = 100)
)
dev.off()
# 关于scale的进一步探索:zz.scale.R
# 3.相关性热图----
pheatmap::pheatmap(cor(exp), #列与列之间的相关性
annotation_col = annotation_col)
pheatmap::pheatmap(cor(n),
annotation_col = annotation_col)
pheatmap::pheatmap(cor(n2),
annotation_col = annotation_col
)
dev.off()
关于相关性背后的故事:https://mp.weixin.qq.com/s/IqMW6Qjf64dn30F4RQg5kQ
rm(list = ls())
load(file = "step2output.Rdata")
#差异分析,用limma包来做
#需要表达矩阵和Group,不需要改
library(limma)
design=model.matrix(~Group)
fit=lmFit(exp,design)
fit=eBayes(fit)#贝叶斯检验
deg=topTable(fit,coef=2,number = Inf)
#为deg数据框添加几列
#1.加probe_id列,把行名变成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))
head(deg)
#2.加上探针注释
table(!duplicated(ids$probe_id))
table(!duplicated(ids$symbol))
#按symbol列去重,常见标准有3个:最大值/平均值/随机去重
#随机去重,另两个见zz.去重方式.R
ids = ids[!duplicated(ids$symbol),]
deg <- inner_join(deg,ids,by="probe_id")
head(deg)
nrow(deg)
#3.加change列,标记上下调基因
logFC_t=1
P.Value_t = 0.01
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change)
#4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人类
#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
dim(deg)
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL"))
dim(deg)
length(unique(deg$symbol))
save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata")
三种去重方式:最大值\平均值\随机值
rm(list = ls())
load("step2output.Rdata")
#保留最大值
exp2 = exp[ids$probe_id,]
identical(ids$probe_id,rownames(exp2))
ids = ids[order(rowSums(exp2),decreasing = T),]
ids = ids[!duplicated(ids$symbol),];nrow(ids)
# 拿这个ids去inner_join
#求平均值
rm(list = ls())
load("step2output.Rdata")
exp3 = exp[ids$probe_id,]
exp3[1:4,1:4]
# 有重复的基因,求平均值
s = unique(ids$symbol[duplicated(ids$symbol)]);length(s)
exp4 = sapply(s, function(i){
colMeans(exp3[ids$symbol==i,])
})
dim(exp4)
exp4 = t(exp4)
# 无重复的基因,直接使用
s2 = ids$symbol[!ids$symbol %in% s];length(s2)
length(unique(ids$symbol))
exp5 = rbind(exp4,exp3[ids$symbol %in% s2,]);dim(exp5)
# 此时拿到的exp5已经是一个基因为行名的表达矩阵,直接差异分析,不再需要inner_jpin