在拿到PICRUST2预测的KO丰度表格后,我们其实接下来也可以像完成优势物种进化分支图一样来针对KO的Lefse分析
首先接上篇,获取KEGG ORTHOLOGY分层注释文件。
Part 1 数据整理
data(KO)
head(KO)
# A tibble: 6 x 9
# KO Description L1_ID L1 L2_ID L2 L3_ID L3 PathwayID
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 K00844 HK; hexokinase [EC:2.7.1.1] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010
# 2 K12407 GCK; glucokinase [EC:2.7.1.2] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010
# 3 K00845 glk; glucokinase [EC:2.7.1.2] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010
# 4 K25026 glk; glucokinase [EC:2.7.1.2] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010
# 5 K01810 GPI, pgi; glucose-6-phosphate isomerase [EC:5.3.1.9] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010
# 6 K06859 pgi1; glucose-6-phosphate isomerase, archaeal [EC:5.3.1.9] 09100 Metabolism 09101 Carbohydrate metabolism 00010 Glycolysis / Gluconeo~ ko00010
#加载预测的KO丰度表格
raw <- read_tsv("./picrust2/pred_metagenome_unstrat.tsv")
processed <- inner_join(raw,KO,by = c("function" = "KO"))
x <- processed %>% #总结KO L3数据
group_by(PathwayID) %>%
summarise(M1=sum(M1),
M2=sum(M2),
M3=sum(M3),
M4=sum(M4),
M5=sum(M5),
M6=sum(M6),
M7=sum(M7),
M8=sum(M8),
N1=sum(N1),
N2=sum(N2),
N3=sum(N3),
N4=sum(N4),
N5=sum(N5),
N6=sum(N6),
L1=unique(L1),
L2=unique(L2),
L3=unique(L3))
head(x)
# A tibble: 6 x 18
# PathwayID M1 M2 M3 M4 M5 M6 M7 M8 N1 N2 N3 N4 N5 N6 L1 L2 L3
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
# 1 ko00010 979437. 1157257. 966058. 799234. 977727. 790647. 923608. 905087. 1087292. 886595. 1027478. 980108. 936901. 941357. Metabolism Carboh~ Glycol~
# 2 ko00020 555098. 662764. 574935. 456071 556023. 430408. 498903. 464911. 526917. 451617. 556598. 550912. 500860. 480391. Metabolism Carboh~ Citrat~
# 3 ko00030 593586. 675527. 574631. 471552. 552199. 473592. 575287. 571875. 767123 590887. 696223. 639922. 596782. 630457. Metabolism Carboh~ Pentos~
# 4 ko00040 270708. 312007. 241999. 220993. 236960. 218570. 265125. 260460. 320277. 263980. 297270. 294852. 285620. 264528. Metabolism Carboh~ Pentos~
# 5 ko00051 581103. 685743. 538040. 440673. 548036. 449644. 520336. 522393. 644275. 523412. 606600. 545518. 543689. 557024. Metabolism Carboh~ Fructo~
# 6 ko00052 599459. 686096. 512145. 461007. 560880. 484895. 530024. 588223. 663407. 488498. 568301. 539427. 576209. 545741. Metabolism Carboh~ Galact~
Part 2 Lefse分析
#整理成phyloseq对象
sampleda <- readxl::read_excel("group.xlsx") %>% tibble::column_to_rownames("#SampleID")
featuretab <- x[,c(2:15)] %>% apply(2,as.integer) %>% as.matrix()
rownames(featuretab) <- x$PathwayID
tax <- as.matrix(x[,c(1,16:18)] %>% tibble::column_to_rownames("PathwayID"))
library(phyloseq)
ps <- phyloseq(otu_table(featuretab,taxa_are_rows = T),tax_table(tax),sample_data(sampleda))
# ps1 <- MicrobiotaProcess::as.mpse(ps) %>%
# mp_rrarefy() %>%
# mp_cal_abundance(.abundance = RareAbundance,force=T)
# temp <- ps1 %>% mp_extract_assays(.abundance = RelRareAbundanceBySample) %>% as.matrix()
# ps1 <- phyloseq(otu_table(temp,taxa_are_rows = T),tax_table(tax),sample_data(sampleda))
#ps和ps1的分析结果相同
#lefse分析
library(MicrobiotaProcess)
set.seed(1024)
deres <- diff_analysis(obj = ps, #实际利用的是抽平后的相对丰度表
classgroup = "Category",
firstcomfun = "kruskal.test",
# standard_method = "NULL",
padjust = "fdr",#p值校正方法
filtermod = "pvalue",#以pvalue列过滤
firstalpha = 0.05,
strictmod = TRUE,#当subgroup被提供时,strictmod决定是否执行all-against-all。
secondcomfun = "wilcox.test",
secondalpha = 0.01,
mlfun = "lda",#线性判别分析,可选随机森林
ldascore=3,#线性判别分数
type="others" #非物种
)
deres
# The original data: 384 features and 14 samples
# The sample data: 1 variables and 14 samples
# The taxda contained 334 by 3 rank
# after first test (kruskal.test) number of feature (pvalue<=0.05):162
# after second test (wilcox.test and generalizedFC) number of significantly discriminative feature:110
# after lda, Number of discriminative features: 17 (certain taxonomy classification:17; uncertain taxonomy classication: 0)
Part 3 画图
ggdiffclade(obj=deres,
layout="radial",#布局类型
alpha=0.3, #树分支的背景透明度
linewd=0.2, #树中连线粗细
skpointsize=0.8, #树骨架中国点的大小
taxlevel=2, #要展示的树分支水平1(L1)及1以下(L2和L3)
cladetext=3,#文本大小
setColors=F,#自定义颜色
removeUnknown=T,#不删分支,但移除分类中有un_的物种注释
reduce=T)+ # 移除分类不明的物种分支,分支和注释一同删去,为T时removeUnknown参数无效
scale_fill_manual(values=c("#00AED7", "#FD9347"),guide=guide_legend(order=1)) +
scale_size_continuous(range = c(0.5, 2), guide = guide_legend(keywidth = 0.2, keyheight = 1, order = 2))+
guides(color = guide_legend(keywidth = 0.2, keyheight = 1,order = 2,ncol=1)) +
theme(panel.background=element_rect(fill=NA),
legend.position="right",
plot.margin=margin(0,0,0,0),
legend.spacing.y=unit(0.02, "cm"),
legend.title=element_text(size=12),
legend.text=element_text(size=10),
legend.box.spacing=unit(0.02,"cm"))
ggdiffbox(
deres,
l_xlabtext="relative abundance (%)",
box_notch=FALSE,
colorlist=c("#00AED7", "#FD9347")
)