这里是佳奥！我们继续非模式种转录组的下游分析。

上篇我们拿到了raw count矩阵，但是我们还缺少很多东西，让我们一步一步来。

1、选择合适方法标准化表达矩阵

2、GO/KEGG的注释文件

如果课题组内有完整的流程，就按照课题组内的方法。我是想独立把全流程走一遍，这个部分的篇幅就会很长，那么我们开始吧。

1 R包安装

if (!requireNamespace("BiocManager", quietly = TRUE)) 
install.packages("BiocManager") ##判断是否存在BiocManager包，不存在的话安装
  
library(BiocManager) 
BiocManager::install(c('airway','DESeq2','edgeR','limma'),ask = F,update = F)

2 检查数据

rm(list = ls())
options(stringsAsFactors = F)
a <- read.table('all.counts.txt', header = T)

meta <- a[,1:6]
exprSet <- a[,7:ncol(a)]
colnames(exprSet)

##重命名列名便于后续两两比对
colnames(exprSet) <- c('C1','C2','C3','Y1','Y2','Y3') 

##增加分组信息
group_list <- c('C','C','C','Y','Y','Y')
rownames(exprSet)<-meta$Geneid

##保存矩阵
save(meta,exprSet,group_list,file = 'raw_exprSet.Rdata')
load(file = 'raw_exprSet.Rdata')

##检查样本内相关性和样本间相关性
pheatmap::pheatmap(cor(exprSet))

> dim(exprSet)
[1] 33213    18

#这里初步过滤低表达量基因
exprSet=exprSet[apply(exprSet,1, function(x) sum(x>1) > 6),]
dim(exprSet)

> dim(exprSet)
[1] 25092    18

初步检查了raw count后下一步就是选择合适的标准化方法了。

3 标准化

3.1 标准化之edgeR

##标准化表达矩阵                      
exprSet=log(edgeR::cpm(exprSet)+1)
#exprSet<-round(exprSet) 取整数
#exprSet <- exprSet[!apply(exprSet,1,function(x){sum(floor(x)==0)>1}),] 过滤0数值
#dim(exprSet)
#[1] 18921    18             

> dim(exprSet)
[1] 25092    18
                      
##再挑选top500的MAD值基因
exprSet=exprSet[names(sort(apply(exprSet, 1, mad),decreasing = T)[1:500]),]
dim(exprSet)
M=cor(log2(exprSet+1))
tmp=data.frame(g=group_list)
rownames(tmp)=colnames(M)
pheatmap::pheatmap(M,annotation_col = tmp)
# 现在就可以看到，组内样本很好的聚集在一起
# 组内的样本的相似性是要高于组间
pheatmap::pheatmap(M,annotation_col = tmp)

pheatmap(scale(cor(log2(exprSet+1))))

save(meta,exprSet,group_list,file = 'edgeRclean_exprSet.Rdata')               
#以上代码，就是为了检查数据集exprSet里面的表达矩阵和表型信息

3.2 标准化之DESeq2

DESeq2包内置归一化算法，因此输入的矩阵必须为raw count。FPKM count和log计算后的矩阵都无法导入DESeq2包进行后续的差异分析。

3.3 标准化之FPKM（TPM、CPM）

这个是实验室内最多使用的标准化方法吧。这里是使用GFT文件的注释信息来计算FPKM矩阵。

library(tidyverse)

#读gtf文件，计算所有外显子的长度
gtf <- read_tsv("hg38.gtf", comment="#", col_names=c('chr','source','type','start','end','score','strand','phase','attributes')) %>% filter(type=='exon') %>% mutate(len = end - start + 1) %>% select(start, end, attributes,len)

#计算基因的非冗余外显子的长度，即获得有效基因长度
gtf$attributes %>% str_extract(., "gene_id \"[\\w|\\.]+") %>% str_remove(., "gene_id \"") -> gtf$gene_id
gtf %>% select(start, end, gene_id, len) %>% distinct(start,end,gene_id, .keep_all = T) %>% select(gene_id,len) %>% group_by(gene_id) %>% summarise(est_len=sum(len)) -> gtf

#整理数据
a <- read.table('all.counts.txt', header = T)
colnames(gtf) <- c('Geneid','est_len')

#读取count的表达量矩阵，列名为gene_id和count，其中gene_id是和gtf文件一致的，然后和刚才计算得到的有效基因长度合并
expmat <- read.table(file = 'all.counts.txt',header = T) %>% inner_join(gtf , by = 'Geneid' ) %>% drop_na() 

#各种转换的方法
countToTpm <- function(counts, effLen)
{
    rate <- log(counts) - log(effLen)
    denom <- log(sum(exp(rate)))
    exp(rate - denom + log(1e6))
}
 
countToFpkm <- function(counts, effLen)
{
    N <- sum(counts)
    exp( log(counts) + log(1e9) - log(effLen) - log(N) )
}
 
fpkmToTpm <- function(fpkm)
{
    exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
 
countToCPM <- function( counts)
{
    N <- sum(counts)
    exp( log(counts) + log(1e6) - log(N) )
}

colnames(expmat) <- c("Geneid","Chr","Start","End","Strand","Length","count","est_len")

expmat %>% 
    mutate( FPKM = countToFpkm(.$count, .$est_len) ) %>% #转FPKM
    mutate( TPM = countToTpm(.$count, .$est_len) )  %>% #转TPM
    mutate( CPM = countToCPM(.$count) ) %>% #转CPM
   select(-est_len) %>% write_tsv("out.xls") #输出结果

一次计算一列数值，多次算完在excel内合并即可。

4 DEG差异分析（DESeq2）+火山图绘制

rm(list = ls())
options(stringsAsFactors = F)
load(file = 'raw_exprSet.Rdata')
group_list

##表达矩阵去除0值
newexprSet <- exprSet[!apply(exprSet,1,function(x){sum(floor(x)==0)>1}),]

##查看数据分布
hist(colSums(newexprSet))

##归一化表达矩阵
NM_exprSet <- t(scale(t(newexprSet)))

##差异分析

##DESeq_DEG 表达矩阵需要raw count值，因此newexprSet

suppressMessages(library(DESeq2)) 
(colData <- data.frame(row.names=colnames(newexprSet), group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = newexprSet,
                              colData = colData,
                              design = ~ group_list)
dds <- estimateSizeFactors(dds) ##计算归一化
dds <- DESeq(dds)

res <- results(dds, 
               contrast=c("group_list","C","Y"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG=as.data.frame(resOrdered)

##去除na
DEG <- na.omit(DEG)

nrDEG=DEG
DEseq_DEG=nrDEG

##绘制热图
library(pheatmap)
choose_gene=head(rownames(DEG),100) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)

##绘制火山图
logFC_cutoff <- with(DEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )

# logFC_cutoff=1
DEG$change = as.factor(ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff,
                                   ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)

library(ggplot2)
g = ggplot(data=DEG, 
           aes(x=log2FoldChange, y=-log10(pvalue), 
               color=change)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 p-value") +
  ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g,filename = paste0(n,'_volcano.png'))
}

##Dispersion plot
plotDispEsts(dds, main="Dispersion plot")

##
png("DESeq2_RAWvsNORM.png",height = 800,width = 800)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(exprSet, col = cols,main="expression value",las=2)
boxplot(exprMatrix_rlog, col = cols,main="expression value",las=2)
hist(as.matrix(exprSet))
hist(exprMatrix_rlog)

5 PCA图绘制

rm(list = ls())
options(stringsAsFactors = F)
load(file = 'clean_exprSet.Rdata')

##PCA 每次都要检测数据
dat=exprSet
dat[1:4,1:4]
dim(dat)

##step1:准备数据

##下面是画PCA的必须操作，需要看说明书。
dat=t(dat)
dat=as.data.frame(dat)
dat=cbind(dat,group_list)
# dat$group_list=group_list


##step2:画图
library("FactoMineR")
library("factoextra") 
# The variable group_list (index = 54676) is removed
# before PCA analysis
ncol(dat)
dat[,ncol(dat)]
# 画图是用数据，因此去掉最后一列grouplist
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)
plot(dat.pca,choix="ind")
# Graph of individuals
fviz_pca_ind(dat.pca,
             geom.ind = "point", # show points only (nbut not "text")
             col.ind = dat$group_list, # color by groups
             palette = c("#00AFBB", "#E7B800", '#CC00FF', '#FF0099'),
             addEllipses = TRUE, # Concentration ellipses # 是否圈起来
             legend.title = "Groups"
)

6 eggNOG在线注释蛋白.fa文件

常规的步骤结束了，下一步按理是GO富集和KEGG通路绘图，但是这里是非模式植物，Tree，既没有现成的包，也没有现成的注释文件，没关系，我们自己来。

1 获得非模式种的蛋白.fa文件

这个同参考基因组一样，一样的获取方法，是后续一切注释的基础。

2 使用在线网站对蛋白.fa文件进行GO/KEGG注释

http://eggnog-mapper.embl.de/

QQ截图20221226171802.png

选择文件类型，填入个人邮箱，然后提交即可。

QQ截图20221226171919.png

但是分析不会开始，进入填的邮箱，会收到一封这样的邮件。

QQ截图20221226172217.png

点进去，Click to manage your job

QQ截图20221226172354.png

点击Start job，出现下面界面说明开始排队分析了，休息一下吧！

QQ截图20221226172416.png

QQ截图20221226172826.png

结束后，点击Access your job files here

QQ截图20221226194646.png

选择out.emapper.annotations

QQ截图20221226194745.png

下一步就是把这一个文件拆分成可以被R包识别的格式。

3 使用python对注释文件进行拆分

首先，电脑要安装python（这里的版本是3.10）

QQ截图20221226195023.png

使用VS Code来操作，安装个python插件即可。

##制作背景文件

python.exe -m pip install --upgrade pip

##运行脚本，获取go.tb 
##go-basic.obo文件获取地址：http://geneontology.org/docs/download-ontology/#subsets
##打开URL右键保存
python parse_go_obofile.py -i go-basic.obo -o go.tb  

pip install requests

##运行脚本，获取KOannotation.tsv，GOannotation.tsv
##out.emapper.annotations文件在eggNOG注释返回邮件中
python parse_eggNOG.py -i out.emapper.annotations -g go.tb -O ath,osa -o D:\work

parse_go_obofile.py

import pandas as pd
import argparse
import re


def main(input_path, output_path):
    """
    :param input_path: The go OBO file path
    :param output_path: result file path.
    :return: None
    """
    lev_dict = {'molecular_function': 'MF', 'biological_process': 'BP', 'cellular_component': 'CC'}
    with open(input_path, 'r') as f:
        cont = True
        cont_num = 0
        append_dict = {}
        dict_list = []
        while cont:
            cont = f.readline()
            cont_num += 1
            if cont == "\n":
                dict_list.append(append_dict)
                append_dict = {}
                cont_num = 0
                continue
            else:
                try:
                    if cont_num == 2:
                        a = re.search("GO:[0-9]{7}", cont).group()
                    elif cont_num == 3:
                        b = re.search("name: .*", cont).group()[6:]
                    elif cont_num == 4:
                        c = re.search("namespace: .*", cont).group()[11:]
                        append_dict = {"GO": a,
                                       "Description": b,
                                       "level": lev_dict.get(c)}
                    elif re.match("alt_id", cont).group():
                        append_dict2 = {"GO": re.search("GO:[0-9]{7}", cont).group(),
                                        "Description": b,
                                        "level": lev_dict.get(c)}
                        dict_list.append(append_dict2)
                except AttributeError:
                    if cont_num == 2:
                        append_dict.update({"GO": None})
                    elif cont_num == 3:
                        append_dict.update({"Description": None})
                    elif cont_num == 4:
                        append_dict.update({"level": None})
        go_annotation = pd.DataFrame(dict_list)
        go_annotation = go_annotation[["GO", "Description", "level"]]
        go_annotation.dropna().to_csv(output_path, sep="\t", index=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="This is the script for parsing GO database OBO file to a more \
                                                 user-friendly format")
    parser.add_argument('-i', '--input_file', required=True,
                        help='<filepath>  The GO database file in OBO format')
    parser.add_argument('-o', '--output_file', required=True,
                        help='<filepath>  The result path')
    args = parser.parse_args()
    main(args.input_file, args.output_file)

parse_eggNOG.py

import pandas as pd
import numpy as np
import argparse
import requests
import re
import os

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"}


def get_html(url):
    resp = requests.get(url, headers=headers)
    return resp.text


def own_mapdb(org_list):
    """
    Pick map num based on user prefer
    :param org_list:organism name list (KEGG abb). eg. ["ath","osa","dosa",.....]
    :return: own_maplist: The picked map num: eg. [map0001,map0002,.....,map1111]
    """
    if not org_list == ["all"]:
        map_set = set()
        for org in org_list:
            tmp = get_html("http://rest.kegg.jp/list/pathway/"+org)
            tmp = list(map(lambda x: x.split("\t"), tmp.split("\n")))
            tmp_df = pd.DataFrame(tmp, columns=["pathway", "description"]).dropna()
            map_set = map_set.union(set(tmp_df.pathway.apply(lambda x: re.search("[0-9]{5}", str(x)).group())))
        map_list = list(map(lambda x: "map"+x, list(map_set)))
        return map_list
    else:
        return None


def get_KEGGmap(own_maplist):
    """
    INPUT:own_maplist: The picked map num: eg. [map0001,map0002,.....,map1111]
    :return: KO2map_df:columns: [KO,pathway]
             KEGGmap_df: columns: [pathway, description]
    """
    # KEGG map description
    KEGGmap = get_html("http://rest.kegg.jp/list/pathway/")
    KEGGmap = list(map(lambda x: x.split("\t"), KEGGmap.split("\n")))
    KEGGmap_df = pd.DataFrame(KEGGmap, columns=["pathway", "description"]).dropna()
    KEGGmap_df["pathway"] = KEGGmap_df.pathway.apply(lambda x: x.strip("path:"))
    # KEGG KO2map
    KO2map = get_html("http://rest.kegg.jp/link/pathway/ko")
    KO2map = list(map(lambda x: x.split("\t"), KO2map.split("\n")))
    KO2map_df = pd.DataFrame(KO2map, columns=["ko", "pathway"]).dropna()
    KO2map_df["pathway"] = KO2map_df.pathway.apply(lambda x: x.strip("path:"))
    KO2map_df["KO"] = KO2map_df.ko.apply(lambda x: x.strip("ko:"))
    if own_maplist:
        KEGGmap_df = KEGGmap_df[KEGGmap_df["pathway"].isin(own_maplist)]
        KO2map_df = KO2map_df[KO2map_df["pathway"].isin(own_maplist)]
    return KO2map_df, KEGGmap_df


def parse_KO_f1(query_line):
    """
    :param query_line: a line in eggNOG annotation file which contains a query result
    :return:  a dict for parse_KO function. eg. {"gene":gene name,"KO": KO}
    """
    KO_list = [i for i in map(lambda x: x.lstrip("ko:"), query_line["KEGG_ko"].split(","))]
    return {"gene": query_line["Query"], "KO": KO_list}


def parse_KO(df4parse, org_list):
    """
    parse the KEGG part in eggNOG annotation file
    :param df4parse: the pd.Dataframe object directly comes from the eggNOG annotation file(skip the comment lines, of course)
    :return:the parsed KEGG annotation in pd.Dataframe.
    """
    KO2map, KEGGmap = get_KEGGmap(own_mapdb(org_list))
    gene2KO = df4parse[["Query", "KEGG_ko"]].dropna().apply(parse_KO_f1, axis=1, result_type="expand")
    gene2KO = pd.DataFrame({'gene': gene2KO.gene.repeat(gene2KO["KO"].str.len()), 'KO': np.concatenate(gene2KO["KO"].values)})
    gene2map = pd.merge(gene2KO, KO2map, on="KO", how="left")
    gene2map = pd.merge(gene2map, KEGGmap, on="pathway", how="left")
    return gene2map.dropna()


def parse_GO_f1(query_line):
    """
    :param query_line: a line in eggNOG annotation file which contains a query result
    :return:  a dict for parse_KO function. eg. {"gene":gene name,"GO": KO}
    """
    GO_list = [i for i in query_line["GOs"].split(",")]
    return {"gene": query_line["Query"], "GO": GO_list}


def parse_GO(df4parse, GO_path):
    """
    parse the GO part in eggNOG annotation file
    :param df4parse: the pd.Dataframe object directly comes from the eggNOG annotation file(skip the comment lines, of course)
    :return:the parsed GO annotation in pd.Dataframe.
    """
    gene2GO = df4parse[["Query", "GOs"]].dropna().apply(parse_GO_f1, axis=1, result_type="expand")
    gene2GO = pd.DataFrame(
        {'gene': gene2GO.gene.repeat(gene2GO["GO"].str.len()), 'GO': np.concatenate(gene2GO["GO"].values)})
    go_df = pd.read_table(GO_path)
    go_df.drop(columns=['Description'], inplace=True)
    gene2GO = pd.merge(gene2GO, go_df, on="GO", how="left")
    return gene2GO


def check_column(_file):
    with open(_file) as f_in:
        while True:
            line = f_in.readline()
            if not line.startswith("#"):
                items = line.split('\t')
                for _idx in range(len(items)):
                    if items[_idx].startswith("GO:"):
                        go_idx = _idx
                    elif items[_idx].startswith("ko:"):
                        kegg_idx = _idx
            if 'go_idx' in locals() and 'kegg_idx' in locals():
                return 0, go_idx, kegg_idx


def main(input_file, out_dir, go_file, org_list):
    file4parse = pd.read_csv(input_file,
                             sep="\t",
                             comment="#",
                             usecols=check_column(input_file),
                             names=['Query', 'GOs', "KEGG_ko"])
    file4KO = parse_KO(file4parse, org_list)[["gene", "KO", "pathway", "description"]]
    file4GO = parse_GO(file4parse, go_file)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    file4KO.to_csv(os.path.join(out_dir, "KOannotation.tsv"), sep="\t", index=False)
    file4GO.to_csv(os.path.join(out_dir, "GOannotation.tsv"), sep="\t", index=False)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="This is the script for parsing eggNOGv2 annotation file to the file \
                                                  which ClusterProfile need")
    parser.add_argument('-i', '--input_file', required=True,
                        help='<filepath>  The eggNOG annotation file')
    parser.add_argument('-g', '--go_file', required=True,
                        help='<filepath>  The GO database file(come from parse_go_obofile.py)')
    parser.add_argument('-O', '--org_list', nargs="?", const=1, default="all",
                        help='<organism name>  The reference organisms in KEGG database(use KEGG abb. please).If not \
                        specified, it will use all pathway.')
    parser.add_argument('-o', '--output_directory', required=True,
                        help='<dirpath>  the directory for results')
    args = parser.parse_args()
    main(args.input_file, args.output_directory, args.go_file, args.org_list.split(","))

现在有2个脚本，2个原始文件，一步步运行：

QQ截图20221226200519.png

生成go.tb

PS D:\work> python parse_go_obofile.py -i go-basic.obo -o go.tb

pip install requests

运行注释脚本：

python parse_eggNOG.py -i out.emapper.annotations -g go.tb -O ath,osa -o D:\work

查看生成文件：

QQ截图20221226200915.png

其中，GOannotation.tsv和KOannotation.tsv就是我们需要的可以被包直接读取的注释文件了。

4 获取genelist

##获取差异基因
logFC_t=1
DEG$g=ifelse(DEG$pvalue>0.05,'stable',
             ifelse( DEG$log2FoldChange > logFC_t,'UP',
                     ifelse( DEG$log2FoldChange < -logFC_t,'DOWN','stable') )
)
table(DEG$g)
  DOWN stable     UP 
   392  21712    430 
head(DEG)

DEG$gene_name <- rownames(DEG)
gene_up= DEG[DEG$g == 'UP','gene_name'] 
gene_down=DEG[DEG$g == 'DOWN','gene_name'] 

genedown<-as.matrix(gene_down)
geneup<-as.matrix(gene_up)

save(DEG,gene_up,gene_down,file = 'DEG.Rdata')

##差异基因
log2FC设置为大于等于或小于等于1.5 & padj小于0.05或者0.01的gene

前期准备全部完成，我们进入到最后的GO和KEGG分析部分。

7 GO&KEGG分析（clusterProfiler）

需要三个文件：go.tb、KOannotation.tsv、GOannotation.tsv。

load(file='DEG.Rdata')
library(clusterProfiler)
KOannotation <- read.delim("annotation/KOannotation.tsv", stringsAsFactors=FALSE)
GOannotation <- read.delim("annotation/GOannotation.tsv", stringsAsFactors=FALSE)
GOinfo <- read.delim("annotation/go.tb", stringsAsFactors=FALSE)
DEG$gene_name<-paste(DEG$gene_name,".1",sep="",collapse=NULL, recycle0=FALSE)

##获取gene list
gene_list_up <- DEG[DEG$g == 'UP','gene_name'] 
gene_list_down <- DEG[DEG$g == 'DOWN','gene_name']
GOannotation = split(GOannotation, with(GOannotation, level))

##GO富集
## 拆分成BP，MF，CC三个数据框
GOannotation = split(GOannotation, with(GOannotation, level))
## 以BP为例
GOenricher_up_BP <- enricher(gene_list_up,
                        TERM2GENE=GOannotation[['BP']][c(2,1)],
                        TERM2NAME=GOinfo[1:2])

#KEGG富集
KEGGenricher_up <- enricher(gene_list_up,
                        TERM2GENE=KOannotation[c(3,1)],
                        TERM2NAME=KOannotation[c(3,4)])

library(enrichplot)

##气泡图
dotplot(GOenricher_up_BP)
dotplot(KEGGenricher_up)

cnetplot(GOenricher_down_CC)

最后就是根据结果不断微调。下游分析至此告一段落。

8 写在后面

这个就是我整理的非模式种转录组上下游分析的具体代码，其中绘图部分还没有仔细研究，包括最后的气泡图，还需不断精进。

那么希望各位在分析的时候也可以精准的找到出现的问题，拿到预期的实验结果吧。

我们新篇章的学习再见！

【非模式种转录组】二、下游分析R语言篇