这里是佳奥!我们继续非模式种转录组的下游分析。
上篇我们拿到了raw count矩阵,但是我们还缺少很多东西,让我们一步一步来。
1、选择合适方法标准化表达矩阵
2、GO/KEGG的注释文件
如果课题组内有完整的流程,就按照课题组内的方法。我是想独立把全流程走一遍,这个部分的篇幅就会很长,那么我们开始吧。
1 R包安装
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager") ##判断是否存在BiocManager包,不存在的话安装
library(BiocManager)
BiocManager::install(c('airway','DESeq2','edgeR','limma'),ask = F,update = F)
2 检查数据
rm(list = ls())
options(stringsAsFactors = F)
a <- read.table('all.counts.txt', header = T)
meta <- a[,1:6]
exprSet <- a[,7:ncol(a)]
colnames(exprSet)
##重命名列名便于后续两两比对
colnames(exprSet) <- c('C1','C2','C3','Y1','Y2','Y3')
##增加分组信息
group_list <- c('C','C','C','Y','Y','Y')
rownames(exprSet)<-meta$Geneid
##保存矩阵
save(meta,exprSet,group_list,file = 'raw_exprSet.Rdata')
load(file = 'raw_exprSet.Rdata')
##检查样本内相关性和样本间相关性
pheatmap::pheatmap(cor(exprSet))
> dim(exprSet)
[1] 33213 18
#这里初步过滤低表达量基因
exprSet=exprSet[apply(exprSet,1, function(x) sum(x>1) > 6),]
dim(exprSet)
> dim(exprSet)
[1] 25092 18
初步检查了raw count后下一步就是选择合适的标准化方法了。
3 标准化
3.1 标准化之edgeR
##标准化表达矩阵
exprSet=log(edgeR::cpm(exprSet)+1)
#exprSet<-round(exprSet) 取整数
#exprSet <- exprSet[!apply(exprSet,1,function(x){sum(floor(x)==0)>1}),] 过滤0数值
#dim(exprSet)
#[1] 18921 18
> dim(exprSet)
[1] 25092 18
##再挑选top500的MAD值基因
exprSet=exprSet[names(sort(apply(exprSet, 1, mad),decreasing = T)[1:500]),]
dim(exprSet)
M=cor(log2(exprSet+1))
tmp=data.frame(g=group_list)
rownames(tmp)=colnames(M)
pheatmap::pheatmap(M,annotation_col = tmp)
# 现在就可以看到,组内样本很好的聚集在一起
# 组内的样本的相似性是要高于组间
pheatmap::pheatmap(M,annotation_col = tmp)
pheatmap(scale(cor(log2(exprSet+1))))
save(meta,exprSet,group_list,file = 'edgeRclean_exprSet.Rdata')
#以上代码,就是为了检查数据集exprSet里面的表达矩阵和表型信息
3.2 标准化之DESeq2
DESeq2包内置归一化算法,因此输入的矩阵必须为raw count。FPKM count和log计算后的矩阵都无法导入DESeq2包进行后续的差异分析。
3.3 标准化之FPKM(TPM、CPM)
这个是实验室内最多使用的标准化方法吧。这里是使用GFT文件的注释信息来计算FPKM矩阵。
library(tidyverse)
#读gtf文件,计算所有外显子的长度
gtf <- read_tsv("hg38.gtf", comment="#", col_names=c('chr','source','type','start','end','score','strand','phase','attributes')) %>% filter(type=='exon') %>% mutate(len = end - start + 1) %>% select(start, end, attributes,len)
#计算基因的非冗余外显子的长度,即获得有效基因长度
gtf$attributes %>% str_extract(., "gene_id \"[\\w|\\.]+") %>% str_remove(., "gene_id \"") -> gtf$gene_id
gtf %>% select(start, end, gene_id, len) %>% distinct(start,end,gene_id, .keep_all = T) %>% select(gene_id,len) %>% group_by(gene_id) %>% summarise(est_len=sum(len)) -> gtf
#整理数据
a <- read.table('all.counts.txt', header = T)
colnames(gtf) <- c('Geneid','est_len')
#读取count的表达量矩阵,列名为gene_id和count,其中gene_id是和gtf文件一致的,然后和刚才计算得到的有效基因长度合并
expmat <- read.table(file = 'all.counts.txt',header = T) %>% inner_join(gtf , by = 'Geneid' ) %>% drop_na()
#各种转换的方法
countToTpm <- function(counts, effLen)
{
rate <- log(counts) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
countToFpkm <- function(counts, effLen)
{
N <- sum(counts)
exp( log(counts) + log(1e9) - log(effLen) - log(N) )
}
fpkmToTpm <- function(fpkm)
{
exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
countToCPM <- function( counts)
{
N <- sum(counts)
exp( log(counts) + log(1e6) - log(N) )
}
colnames(expmat) <- c("Geneid","Chr","Start","End","Strand","Length","count","est_len")
expmat %>%
mutate( FPKM = countToFpkm(.$count, .$est_len) ) %>% #转FPKM
mutate( TPM = countToTpm(.$count, .$est_len) ) %>% #转TPM
mutate( CPM = countToCPM(.$count) ) %>% #转CPM
select(-est_len) %>% write_tsv("out.xls") #输出结果
一次计算一列数值,多次算完在excel内合并即可。
4 DEG差异分析(DESeq2)+火山图绘制
rm(list = ls())
options(stringsAsFactors = F)
load(file = 'raw_exprSet.Rdata')
group_list
##表达矩阵去除0值
newexprSet <- exprSet[!apply(exprSet,1,function(x){sum(floor(x)==0)>1}),]
##查看数据分布
hist(colSums(newexprSet))
##归一化表达矩阵
NM_exprSet <- t(scale(t(newexprSet)))
##差异分析
##DESeq_DEG 表达矩阵需要raw count值,因此newexprSet
suppressMessages(library(DESeq2))
(colData <- data.frame(row.names=colnames(newexprSet), group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = newexprSet,
colData = colData,
design = ~ group_list)
dds <- estimateSizeFactors(dds) ##计算归一化
dds <- DESeq(dds)
res <- results(dds,
contrast=c("group_list","C","Y"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG=as.data.frame(resOrdered)
##去除na
DEG <- na.omit(DEG)
nrDEG=DEG
DEseq_DEG=nrDEG
##绘制热图
library(pheatmap)
choose_gene=head(rownames(DEG),100) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)
##绘制火山图
logFC_cutoff <- with(DEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
# logFC_cutoff=1
DEG$change = as.factor(ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff,
ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
library(ggplot2)
g = ggplot(data=DEG,
aes(x=log2FoldChange, y=-log10(pvalue),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g,filename = paste0(n,'_volcano.png'))
}
##Dispersion plot
plotDispEsts(dds, main="Dispersion plot")
##
png("DESeq2_RAWvsNORM.png",height = 800,width = 800)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(exprSet, col = cols,main="expression value",las=2)
boxplot(exprMatrix_rlog, col = cols,main="expression value",las=2)
hist(as.matrix(exprSet))
hist(exprMatrix_rlog)
5 PCA图绘制
rm(list = ls())
options(stringsAsFactors = F)
load(file = 'clean_exprSet.Rdata')
##PCA 每次都要检测数据
dat=exprSet
dat[1:4,1:4]
dim(dat)
##step1:准备数据
##下面是画PCA的必须操作,需要看说明书。
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list)
# dat$group_list=group_list
##step2:画图
library("FactoMineR")
library("factoextra")
# The variable group_list (index = 54676) is removed
# before PCA analysis
ncol(dat)
dat[,ncol(dat)]
# 画图是用数据,因此去掉最后一列grouplist
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)
plot(dat.pca,choix="ind")
# Graph of individuals
fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = dat$group_list, # color by groups
palette = c("#00AFBB", "#E7B800", '#CC00FF', '#FF0099'),
addEllipses = TRUE, # Concentration ellipses # 是否圈起来
legend.title = "Groups"
)
6 eggNOG在线注释蛋白.fa文件
常规的步骤结束了,下一步按理是GO富集和KEGG通路绘图,但是这里是非模式植物,Tree,既没有现成的包,也没有现成的注释文件,没关系,我们自己来。
1 获得非模式种的蛋白.fa文件
这个同参考基因组一样,一样的获取方法,是后续一切注释的基础。
2 使用在线网站对蛋白.fa文件进行GO/KEGG注释
http://eggnog-mapper.embl.de/
选择文件类型,填入个人邮箱,然后提交即可。
但是分析不会开始,进入填的邮箱,会收到一封这样的邮件。
点进去,Click to manage your job
点击Start job,出现下面界面说明开始排队分析了,休息一下吧!
结束后,点击Access your job files here
选择out.emapper.annotations
下一步就是把这一个文件拆分成可以被R包识别的格式。
3 使用python对注释文件进行拆分
首先,电脑要安装python(这里的版本是3.10)
使用VS Code来操作,安装个python插件即可。
##制作背景文件
python.exe -m pip install --upgrade pip
##运行脚本,获取go.tb
##go-basic.obo文件获取地址:http://geneontology.org/docs/download-ontology/#subsets
##打开URL右键保存
python parse_go_obofile.py -i go-basic.obo -o go.tb
pip install requests
##运行脚本,获取KOannotation.tsv,GOannotation.tsv
##out.emapper.annotations文件在eggNOG注释返回邮件中
python parse_eggNOG.py -i out.emapper.annotations -g go.tb -O ath,osa -o D:\work
parse_go_obofile.py
import pandas as pd
import argparse
import re
def main(input_path, output_path):
"""
:param input_path: The go OBO file path
:param output_path: result file path.
:return: None
"""
lev_dict = {'molecular_function': 'MF', 'biological_process': 'BP', 'cellular_component': 'CC'}
with open(input_path, 'r') as f:
cont = True
cont_num = 0
append_dict = {}
dict_list = []
while cont:
cont = f.readline()
cont_num += 1
if cont == "\n":
dict_list.append(append_dict)
append_dict = {}
cont_num = 0
continue
else:
try:
if cont_num == 2:
a = re.search("GO:[0-9]{7}", cont).group()
elif cont_num == 3:
b = re.search("name: .*", cont).group()[6:]
elif cont_num == 4:
c = re.search("namespace: .*", cont).group()[11:]
append_dict = {"GO": a,
"Description": b,
"level": lev_dict.get(c)}
elif re.match("alt_id", cont).group():
append_dict2 = {"GO": re.search("GO:[0-9]{7}", cont).group(),
"Description": b,
"level": lev_dict.get(c)}
dict_list.append(append_dict2)
except AttributeError:
if cont_num == 2:
append_dict.update({"GO": None})
elif cont_num == 3:
append_dict.update({"Description": None})
elif cont_num == 4:
append_dict.update({"level": None})
go_annotation = pd.DataFrame(dict_list)
go_annotation = go_annotation[["GO", "Description", "level"]]
go_annotation.dropna().to_csv(output_path, sep="\t", index=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="This is the script for parsing GO database OBO file to a more \
user-friendly format")
parser.add_argument('-i', '--input_file', required=True,
help='<filepath> The GO database file in OBO format')
parser.add_argument('-o', '--output_file', required=True,
help='<filepath> The result path')
args = parser.parse_args()
main(args.input_file, args.output_file)
parse_eggNOG.py
import pandas as pd
import numpy as np
import argparse
import requests
import re
import os
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"}
def get_html(url):
resp = requests.get(url, headers=headers)
return resp.text
def own_mapdb(org_list):
"""
Pick map num based on user prefer
:param org_list:organism name list (KEGG abb). eg. ["ath","osa","dosa",.....]
:return: own_maplist: The picked map num: eg. [map0001,map0002,.....,map1111]
"""
if not org_list == ["all"]:
map_set = set()
for org in org_list:
tmp = get_html("http://rest.kegg.jp/list/pathway/"+org)
tmp = list(map(lambda x: x.split("\t"), tmp.split("\n")))
tmp_df = pd.DataFrame(tmp, columns=["pathway", "description"]).dropna()
map_set = map_set.union(set(tmp_df.pathway.apply(lambda x: re.search("[0-9]{5}", str(x)).group())))
map_list = list(map(lambda x: "map"+x, list(map_set)))
return map_list
else:
return None
def get_KEGGmap(own_maplist):
"""
INPUT:own_maplist: The picked map num: eg. [map0001,map0002,.....,map1111]
:return: KO2map_df:columns: [KO,pathway]
KEGGmap_df: columns: [pathway, description]
"""
# KEGG map description
KEGGmap = get_html("http://rest.kegg.jp/list/pathway/")
KEGGmap = list(map(lambda x: x.split("\t"), KEGGmap.split("\n")))
KEGGmap_df = pd.DataFrame(KEGGmap, columns=["pathway", "description"]).dropna()
KEGGmap_df["pathway"] = KEGGmap_df.pathway.apply(lambda x: x.strip("path:"))
# KEGG KO2map
KO2map = get_html("http://rest.kegg.jp/link/pathway/ko")
KO2map = list(map(lambda x: x.split("\t"), KO2map.split("\n")))
KO2map_df = pd.DataFrame(KO2map, columns=["ko", "pathway"]).dropna()
KO2map_df["pathway"] = KO2map_df.pathway.apply(lambda x: x.strip("path:"))
KO2map_df["KO"] = KO2map_df.ko.apply(lambda x: x.strip("ko:"))
if own_maplist:
KEGGmap_df = KEGGmap_df[KEGGmap_df["pathway"].isin(own_maplist)]
KO2map_df = KO2map_df[KO2map_df["pathway"].isin(own_maplist)]
return KO2map_df, KEGGmap_df
def parse_KO_f1(query_line):
"""
:param query_line: a line in eggNOG annotation file which contains a query result
:return: a dict for parse_KO function. eg. {"gene":gene name,"KO": KO}
"""
KO_list = [i for i in map(lambda x: x.lstrip("ko:"), query_line["KEGG_ko"].split(","))]
return {"gene": query_line["Query"], "KO": KO_list}
def parse_KO(df4parse, org_list):
"""
parse the KEGG part in eggNOG annotation file
:param df4parse: the pd.Dataframe object directly comes from the eggNOG annotation file(skip the comment lines, of course)
:return:the parsed KEGG annotation in pd.Dataframe.
"""
KO2map, KEGGmap = get_KEGGmap(own_mapdb(org_list))
gene2KO = df4parse[["Query", "KEGG_ko"]].dropna().apply(parse_KO_f1, axis=1, result_type="expand")
gene2KO = pd.DataFrame({'gene': gene2KO.gene.repeat(gene2KO["KO"].str.len()), 'KO': np.concatenate(gene2KO["KO"].values)})
gene2map = pd.merge(gene2KO, KO2map, on="KO", how="left")
gene2map = pd.merge(gene2map, KEGGmap, on="pathway", how="left")
return gene2map.dropna()
def parse_GO_f1(query_line):
"""
:param query_line: a line in eggNOG annotation file which contains a query result
:return: a dict for parse_KO function. eg. {"gene":gene name,"GO": KO}
"""
GO_list = [i for i in query_line["GOs"].split(",")]
return {"gene": query_line["Query"], "GO": GO_list}
def parse_GO(df4parse, GO_path):
"""
parse the GO part in eggNOG annotation file
:param df4parse: the pd.Dataframe object directly comes from the eggNOG annotation file(skip the comment lines, of course)
:return:the parsed GO annotation in pd.Dataframe.
"""
gene2GO = df4parse[["Query", "GOs"]].dropna().apply(parse_GO_f1, axis=1, result_type="expand")
gene2GO = pd.DataFrame(
{'gene': gene2GO.gene.repeat(gene2GO["GO"].str.len()), 'GO': np.concatenate(gene2GO["GO"].values)})
go_df = pd.read_table(GO_path)
go_df.drop(columns=['Description'], inplace=True)
gene2GO = pd.merge(gene2GO, go_df, on="GO", how="left")
return gene2GO
def check_column(_file):
with open(_file) as f_in:
while True:
line = f_in.readline()
if not line.startswith("#"):
items = line.split('\t')
for _idx in range(len(items)):
if items[_idx].startswith("GO:"):
go_idx = _idx
elif items[_idx].startswith("ko:"):
kegg_idx = _idx
if 'go_idx' in locals() and 'kegg_idx' in locals():
return 0, go_idx, kegg_idx
def main(input_file, out_dir, go_file, org_list):
file4parse = pd.read_csv(input_file,
sep="\t",
comment="#",
usecols=check_column(input_file),
names=['Query', 'GOs', "KEGG_ko"])
file4KO = parse_KO(file4parse, org_list)[["gene", "KO", "pathway", "description"]]
file4GO = parse_GO(file4parse, go_file)
if not os.path.exists(out_dir):
os.mkdir(out_dir)
file4KO.to_csv(os.path.join(out_dir, "KOannotation.tsv"), sep="\t", index=False)
file4GO.to_csv(os.path.join(out_dir, "GOannotation.tsv"), sep="\t", index=False)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="This is the script for parsing eggNOGv2 annotation file to the file \
which ClusterProfile need")
parser.add_argument('-i', '--input_file', required=True,
help='<filepath> The eggNOG annotation file')
parser.add_argument('-g', '--go_file', required=True,
help='<filepath> The GO database file(come from parse_go_obofile.py)')
parser.add_argument('-O', '--org_list', nargs="?", const=1, default="all",
help='<organism name> The reference organisms in KEGG database(use KEGG abb. please).If not \
specified, it will use all pathway.')
parser.add_argument('-o', '--output_directory', required=True,
help='<dirpath> the directory for results')
args = parser.parse_args()
main(args.input_file, args.output_directory, args.go_file, args.org_list.split(","))
现在有2个脚本,2个原始文件,一步步运行:
生成go.tb
PS D:\work> python parse_go_obofile.py -i go-basic.obo -o go.tb
pip install requests
运行注释脚本:
python parse_eggNOG.py -i out.emapper.annotations -g go.tb -O ath,osa -o D:\work
查看生成文件:
其中,GOannotation.tsv和KOannotation.tsv就是我们需要的可以被包直接读取的注释文件了。
4 获取genelist
##获取差异基因
logFC_t=1
DEG$g=ifelse(DEG$pvalue>0.05,'stable',
ifelse( DEG$log2FoldChange > logFC_t,'UP',
ifelse( DEG$log2FoldChange < -logFC_t,'DOWN','stable') )
)
table(DEG$g)
DOWN stable UP
392 21712 430
head(DEG)
DEG$gene_name <- rownames(DEG)
gene_up= DEG[DEG$g == 'UP','gene_name']
gene_down=DEG[DEG$g == 'DOWN','gene_name']
genedown<-as.matrix(gene_down)
geneup<-as.matrix(gene_up)
save(DEG,gene_up,gene_down,file = 'DEG.Rdata')
##差异基因
log2FC设置为大于等于或小于等于1.5 & padj小于0.05或者0.01的gene
前期准备全部完成,我们进入到最后的GO和KEGG分析部分。
7 GO&KEGG分析(clusterProfiler)
需要三个文件:go.tb、KOannotation.tsv、GOannotation.tsv。
load(file='DEG.Rdata')
library(clusterProfiler)
KOannotation <- read.delim("annotation/KOannotation.tsv", stringsAsFactors=FALSE)
GOannotation <- read.delim("annotation/GOannotation.tsv", stringsAsFactors=FALSE)
GOinfo <- read.delim("annotation/go.tb", stringsAsFactors=FALSE)
DEG$gene_name<-paste(DEG$gene_name,".1",sep="",collapse=NULL, recycle0=FALSE)
##获取gene list
gene_list_up <- DEG[DEG$g == 'UP','gene_name']
gene_list_down <- DEG[DEG$g == 'DOWN','gene_name']
GOannotation = split(GOannotation, with(GOannotation, level))
##GO富集
## 拆分成BP,MF,CC三个数据框
GOannotation = split(GOannotation, with(GOannotation, level))
## 以BP为例
GOenricher_up_BP <- enricher(gene_list_up,
TERM2GENE=GOannotation[['BP']][c(2,1)],
TERM2NAME=GOinfo[1:2])
#KEGG富集
KEGGenricher_up <- enricher(gene_list_up,
TERM2GENE=KOannotation[c(3,1)],
TERM2NAME=KOannotation[c(3,4)])
library(enrichplot)
##气泡图
dotplot(GOenricher_up_BP)
dotplot(KEGGenricher_up)
cnetplot(GOenricher_down_CC)
最后就是根据结果不断微调。下游分析至此告一段落。
8 写在后面
这个就是我整理的非模式种转录组上下游分析的具体代码,其中绘图部分还没有仔细研究,包括最后的气泡图,还需不断精进。
那么希望各位在分析的时候也可以精准的找到出现的问题,拿到预期的实验结果吧。
我们新篇章的学习再见!