R 针对KEGG ORTHOLOGY的Lefse分析

在拿到PICRUST2预测的KO丰度表格后，我们其实接下来也可以像完成优势物种进化分支图一样来针对KO的Lefse分析
首先接上篇，获取KEGG ORTHOLOGY分层注释文件。

Part 1 数据整理

data(KO)
head(KO)
# A tibble: 6 x 9
# KO     Description                                                L1_ID L1         L2_ID L2                      L3_ID L3                     PathwayID
# <chr>  <chr>                                                      <chr> <chr>      <chr> <chr>                   <chr> <chr>                  <chr>    
# 1 K00844 HK; hexokinase [EC:2.7.1.1]                                09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010  
# 2 K12407 GCK; glucokinase [EC:2.7.1.2]                              09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010  
# 3 K00845 glk; glucokinase [EC:2.7.1.2]                              09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010  
# 4 K25026 glk; glucokinase [EC:2.7.1.2]                              09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010  
# 5 K01810 GPI, pgi; glucose-6-phosphate isomerase [EC:5.3.1.9]       09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010  
# 6 K06859 pgi1; glucose-6-phosphate isomerase, archaeal [EC:5.3.1.9] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010  

#加载预测的KO丰度表格
raw <- read_tsv("./picrust2/pred_metagenome_unstrat.tsv")
processed <- inner_join(raw,KO,by = c("function" = "KO"))
x <- processed %>% #总结KO L3数据
  group_by(PathwayID) %>%
  summarise(M1=sum(M1),
            M2=sum(M2),
            M3=sum(M3),
            M4=sum(M4),
            M5=sum(M5),
            M6=sum(M6),
            M7=sum(M7),
            M8=sum(M8),
            N1=sum(N1),
            N2=sum(N2),
            N3=sum(N3),
            N4=sum(N4),
            N5=sum(N5),
            N6=sum(N6),
            L1=unique(L1),
            L2=unique(L2),
            L3=unique(L3))
head(x)
# A tibble: 6 x 18
# PathwayID      M1       M2      M3      M4      M5      M6      M7      M8       N1      N2       N3      N4      N5      N6 L1         L2      L3     
# <chr>       <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl> <chr>      <chr>   <chr>  
#   1 ko00010   979437. 1157257. 966058. 799234. 977727. 790647. 923608. 905087. 1087292. 886595. 1027478. 980108. 936901. 941357. Metabolism Carboh~ Glycol~
#   2 ko00020   555098.  662764. 574935. 456071  556023. 430408. 498903. 464911.  526917. 451617.  556598. 550912. 500860. 480391. Metabolism Carboh~ Citrat~
#   3 ko00030   593586.  675527. 574631. 471552. 552199. 473592. 575287. 571875.  767123  590887.  696223. 639922. 596782. 630457. Metabolism Carboh~ Pentos~
#   4 ko00040   270708.  312007. 241999. 220993. 236960. 218570. 265125. 260460.  320277. 263980.  297270. 294852. 285620. 264528. Metabolism Carboh~ Pentos~
#   5 ko00051   581103.  685743. 538040. 440673. 548036. 449644. 520336. 522393.  644275. 523412.  606600. 545518. 543689. 557024. Metabolism Carboh~ Fructo~
#   6 ko00052   599459.  686096. 512145. 461007. 560880. 484895. 530024. 588223.  663407. 488498.  568301. 539427. 576209. 545741. Metabolism Carboh~ Galact~

Part 2 Lefse分析

#整理成phyloseq对象
sampleda <- readxl::read_excel("group.xlsx") %>% tibble::column_to_rownames("#SampleID")
featuretab <- x[,c(2:15)] %>% apply(2,as.integer) %>% as.matrix()
rownames(featuretab) <- x$PathwayID
tax <- as.matrix(x[,c(1,16:18)] %>% tibble::column_to_rownames("PathwayID"))
library(phyloseq)
ps <- phyloseq(otu_table(featuretab,taxa_are_rows = T),tax_table(tax),sample_data(sampleda))
# ps1 <- MicrobiotaProcess::as.mpse(ps) %>%
#   mp_rrarefy() %>%
#   mp_cal_abundance(.abundance = RareAbundance,force=T)
# temp <- ps1 %>% mp_extract_assays(.abundance = RelRareAbundanceBySample) %>% as.matrix()
# ps1 <- phyloseq(otu_table(temp,taxa_are_rows = T),tax_table(tax),sample_data(sampleda))
#ps和ps1的分析结果相同
#lefse分析
library(MicrobiotaProcess)
set.seed(1024)
deres <- diff_analysis(obj = ps, #实际利用的是抽平后的相对丰度表
                       classgroup = "Category",
                       firstcomfun = "kruskal.test",
                       # standard_method = "NULL",
                       padjust = "fdr",#p值校正方法
                       filtermod = "pvalue",#以pvalue列过滤
                       firstalpha = 0.05,
                       strictmod = TRUE,#当subgroup被提供时，strictmod决定是否执行all-against-all。
                       secondcomfun = "wilcox.test",
                       secondalpha = 0.01,
                       mlfun = "lda",#线性判别分析，可选随机森林
                       ldascore=3,#线性判别分数
                       type="others" #非物种
)
deres
# The original data: 384 features and 14 samples
# The sample data: 1 variables and 14 samples
# The taxda contained 334 by 3 rank
# after first test (kruskal.test) number of feature (pvalue<=0.05):162
# after second test (wilcox.test and generalizedFC) number of significantly discriminative feature:110
# after lda, Number of discriminative features: 17 (certain taxonomy classification:17; uncertain taxonomy classication: 0)

Part 3 画图

ggdiffclade(obj=deres,
            layout="radial",#布局类型
            alpha=0.3, #树分支的背景透明度
            linewd=0.2, #树中连线粗细
            skpointsize=0.8, #树骨架中国点的大小
            taxlevel=2, #要展示的树分支水平1（L1）及1以下（L2和L3）
            cladetext=3,#文本大小
            setColors=F,#自定义颜色
            removeUnknown=T,#不删分支，但移除分类中有un_的物种注释
            reduce=T)+ # 移除分类不明的物种分支，分支和注释一同删去，为T时removeUnknown参数无效
  scale_fill_manual(values=c("#00AED7", "#FD9347"),guide=guide_legend(order=1)) +
  scale_size_continuous(range = c(0.5, 2), guide = guide_legend(keywidth = 0.2, keyheight = 1, order = 2))+
  guides(color = guide_legend(keywidth = 0.2, keyheight = 1,order = 2,ncol=1)) +
  theme(panel.background=element_rect(fill=NA),
        legend.position="right",
        plot.margin=margin(0,0,0,0),
        legend.spacing.y=unit(0.02, "cm"),
        legend.title=element_text(size=12),
        legend.text=element_text(size=10),
        legend.box.spacing=unit(0.02,"cm"))

ggdiffbox(
  deres, 
  l_xlabtext="relative abundance (%)",
  box_notch=FALSE, 
  colorlist=c("#00AED7", "#FD9347")
)

R 针对KEGG ORTHOLOGY的Lefse分析

Part 1 数据整理

Part 2 Lefse分析

Part 3 画图

推荐阅读更多精彩内容