软件安装
# Instaling OrthoFinder (https://github.com/davidemms/OrthoFinder)
#wget https://github.com/davidemms/OrthoFinder/archive/refs/tags/2.5.5.tar.gz -O ~/software/OrthoFinder-2.5.5.tar.gz
tar zxf ~/software/OrthoFinder-2.5.5.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/OrthoFinder-2.5.5' >> ~/.bashrc
source ~/.bashrc
# Installing mafft (https://mafft.cbrc.jp/alignment/software/)
tar zxf ~/software/mafft-7.505-without-extensions-src.tgz
cd mafft-7.505-without-extensions/core/
perl -p -i -e 's#PREFIX =.*#PREFIX = /opt/biosoft/mafft#' Makefile
perl -p -i -e 's#BINDIR =.*#BINDIR = /opt/biosoft/mafft/bin/#' Makefile
make -j 4
make install
cd ../../ && rm -rf mafft-7.505-without-extensions
echo 'PATH=$PATH:/opt/biosoft/mafft/bin/' >> ~/.bashrc
source ~/.bashrc
# Installing Gblocks (http://molevol.cmima.csic.es/castresana/Gblocks.html)
#wget http://molevol.cmima.csic.es/castresana/Gblocks/Gblocks_Linux64_0.91b.tar.Z -P ~/software/
tar zxf ~/software/Gblocks_Linux64_0.91b.tar.gz -C /opt/biosoft/
chmod 755 /opt/biosoft/Gblocks_0.91b
echo 'PATH=$PATH:/opt/biosoft/Gblocks_0.91b/' >> ~/.bashrc
source ~/.bashrc
# Installing
#wget http://meta.microbesonline.org/fasttree/FastTree.c -P ~/software/
mkdir /opt/biosoft/FastTree
cd /opt/biosoft/FastTree
cp ~/software/FastTree.c .
gcc -DNO_SSE -O3 -finline-functions -funroll-loops -Wall -o FastTree FastTree.c -lm
gcc -DOPENMP -fopenmp -O3 -finline-functions -funroll-loops -Wall -o FastTreeMP FastTree.c -lm
echo 'PATH=$PATH:/opt/biosoft/FastTree' >> ~/.bashrc
source ~/.bashrc
# Installing RAxML (https://github.com/stamatak/standard-RAxML | https://cme.h-its.org/exelixis/web/software/raxml/index.html)
#wget https://github.com/stamatak/standard-RAxML/archive/v8.2.12.tar.gz -O ~/software/RAxML-v8.2.12.tar.gz
tar zxf ~/software/RAxML-v8.2.12.tar.gz -C /opt/biosoft/
mv /opt/biosoft/standard-RAxML-8.2.12/ /opt/biosoft/RAxML-8.2.12/
cd /opt/biosoft/RAxML-8.2.12/
make -f Makefile.SSE3.PTHREADS.gcc -j 4
rm *.o
make -f Makefile.AVX.PTHREADS.gcc -j 4
rm *.o
source ~/.bashrc.mpich4
make -f Makefile.SSE3.HYBRID.gcc -j 4
rm *.o
make -f Makefile.AVX.HYBRID.gcc -j 4
rm *.o
chmod 755 /opt/biosoft/RAxML-8.2.12/usefulScripts/*
echo 'PATH=$PATH:/opt/biosoft/RAxML-8.2.12/' >> ~/.bashrc
source ~/.bashrc
# Installing FigTree (http://tree.bio.ed.ac.uk/software/figtree/ | https://github.com/rambaut/figtree/releases)
#wget https://github.com/rambaut/figtree/releases/download/v1.4.4/FigTree_v1.4.4.tgz -P ~/software
tar zxf ~/software/FigTree_v1.4.4.tgz -C /opt/biosoft/
# Installing PAML
#wget http://abacus.gene.ucl.ac.uk/software/paml4.9i.tgz -P ~/software/
tar zxf ~/software/paml4.9i.tgz -C /opt/biosoft/
cd /opt/biosoft/paml4.9i/
rm bin/*
cd src
make CC='gcc -fcommon' -f Makefile -j 4
cp baseml basemlg chi2 codeml evolver infinitesites mcmctree pamp yn00 ../bin
echo 'PATH=$PATH:/opt/biosoft/paml4.9i/bin' >> ~/.bashrc
source ~/.bashrc
# Installing CAFE (https://github.com/hahnlab/CAFE5)
#wget https://github.com/hahnlab/CAFE5/releases/download/v5.1/CAFE5-5.1.0.tar.gz -P ~/software
tar zxf ~/software/CAFE5-5.1.0.tar.gz && mv CAFE5 /opt/biosoft/CAFE-5.1.0
cd /opt/biosoft/CAFE-5.1.0
./configure && make -j 4
echo 'PATH=$PATH:/opt/biosoft/CAFE-5.1.0/bin/' >> ~/.bashrc
source ~/.bashrc
# install MCScanX (http://chibba.pgml.uga.edu/mcscan2/)
#wget http://chibba.pgml.uga.edu/mcscan2/MCScanX.zip -P ~/software/
unzip ~/software/MCScanX.zip -d /opt/biosoft/
cd /opt/biosoft/MCScanX/
#make
echo 'PATH=$PATH:/opt/biosoft/MCScanX/' >> ~/.bashrc
source ~/.bashrc
# Installing MUMmer (https://github.com/mummer4/mummer)
#wget https://github.com/mummer4/mummer/releases/download/v4.0.0rc1/mummer-4.0.0rc1.tar.gz -P ~/software
tar zxf ~/software/mummer-4.0.0rc1.tar.gz && cd mummer-4.0.0rc1/
./configure --prefix=/opt/biosoft/mummer-4.0.0rc1 && make -j 4 && make install
cd .. && rm -rf mummer-4.0.0rc1
echo 'PATH=$PATH:/opt/biosoft/mummer-4.0.0rc1/bin/' >> ~/.bashrc
source ~/.bashrc
# Installing Mauve (http://darlinglab.org/mauve/mauve.html)
#wget http://darlinglab.org/mauve/downloads/mauve_linux_2.4.0.tar.gz -P ~/software/
tar zxf ~/software/mauve_linux_2.4.0.tar.gz -C /opt/biosoft/
echo 'PATH=$PATH:/opt/biosoft/mauve_2.4.0/' >> ~/.bashrc
source ~/.bashrc
## a. 准备比较基因分析多物种的FASTA序列文件
mkdir -p /home/train/14.genome_comparison/a.preparing_data
cd /home/train/14.genome_comparison/a.preparing_data
# 1. 通过NCBI的Genome数据库(https://www.ncbi.nlm.nih.gov/genome)查找目标物种的信息
# 手工生成文件source.txt,该文件前4列为:物种缩写名(推荐为属名前2个字母+种名前3个字母)、双名法拉丁文物种名、NCBI的Genome数据库网址、从NCBI下载的数据文件前缀。
# 该文件前第一列用于后续的数据文件名前缀、第二列用于后续将结果中的物种名缩写转换成物种名全称、第三列用于数据的批量化下载。此外,手动编辑文件source.txt时,可以生成更多的列,记录其它基因组信息。
echo -e "parub\tPaxillus rubicundulus\thttps://www.ncbi.nlm.nih.gov/genome/35925
sccit\tScleroderma citrinum\thttps://www.ncbi.nlm.nih.gov/genome/18213
laame\tLaccaria amethystina\thttps://www.ncbi.nlm.nih.gov/genome/17383
plost\tPleurotus ostreatus\thttps://www.ncbi.nlm.nih.gov/genome/909?genome_assembly_id=202364
lasul\tLaetiporus sulphureus\thttps://www.ncbi.nlm.nih.gov/genome/17416
phgig\tPhlebiopsis gigantea\thttps://www.ncbi.nlm.nih.gov/genome/18214" > source.txt
# 2. 批量化下载数据:基因组序列和基因结构注释文件
#perl -e 'while (<>) { chomp; @_ = split /\t/; $curl = `curl $_[2]`; if ($curl =~ m/Download genome annotation in.*href=\"(.*)\">GFF/) { $file = $1; $file =~ s/_genomic.gff.gz//; $prefix = $file; $prefix =~ s/.*\///; print "wget $file\_genomic.gff.gz -O $_[0].genomic.gff.gz\nwget $file\_genomic.fna.gz -O $_[0].genomic.fna.gz\n"; } }' source.txt > command.download.list 2> /dev/null
#ParaFly -c command.download.list -CPU 4
tar zxf /home/train/00.incipient_data/data_for_genome_comparison/genomics_data_from_NCBI.tar.gz
# 3. 对NCBI的GFF文件进行格式修正。默认下NCBI的GFF格式不太正确,且其基因ID编号比较乱。
# 对NCBI的GFF3文件进行整理,仅保留含有mRNA信息的基因,修正尾部的CDS(有些NCBI的注释结果中最后一个CDS是不包含终止密码子的),修正同一条链上存在重叠的基因情况,并对基因id进行重命名。
# 最后,得到所有基因的Protein和CDS序列。若一个基因存在多个可变剪接,则仅保留CDS最长的转录本信息。
for i in `cut -f 1 source.txt`
do
echo "gzip -dc $i.genomic.gff.gz > $i.genome.gff; gzip -dc $i.genomic.fna.gz > $i.genome.fasta; GFF3Clear --coverage 0.3 --gene_code_length 5 --gene_prefix $i --genome $i.genome.fasta $i.genome.gff > $i.geneModels.gff3 2> $i.GFF3Clear.log; gff3ToGtf.pl $i.genome.fasta $i.geneModels.gff3 > $i.geneModels.gtf 2> $i.gff3ToGtf.log; eukaryotic_gene_model_statistics.pl $i.geneModels.gtf $i.genome.fasta $i > $i.statistics"
done > command.geneModels.list 2> $i.GFF3Clear.log; #GFF3Clear --coverage 0.3 --gene_code_length 5 --gene_prefix $i --genome $i.genome.fasta $i.genome.gff > $i.geneModels.gff3 id更名
#该过程可以选择脚本gff3_to_sequences.pl,能够获得pep和nucl
ParaFly -c command.geneModels.list -CPU 6
## b. 使用OrthoFinder进行同源基因聚类分析
mkdir -p /home/train/14.genome_comparison/b.OrthoFinder
cd /home/train/14.genome_comparison/b.OrthoFinder
# 准备输入数据:输入一个文件夹,包含各物种全基因组Proteins序列的Fasta文件。
mkdir -p compliantFasta
cd compliantFasta
for i in `cut -f 1 ../../a.preparing_data/source.txt`
do
echo "perl -p -e 's/>/>$i\|/' ../../a.preparing_data/$i.protein.fasta > $i.fasta"#更改id名称
done | sh
cd ..
mkdir compliantFasta_CDS
cd compliantFasta_CDS
for i in `cut -f 1 ../../a.preparing_data/source.txt`
do
echo "perl -p -e 's/>/>$i\|/' ../../a.preparing_data/$i.CDS.fasta > $i.fasta"
done | sh
cd ..
# 使用OrthoFinder进行直系同源基因分析
cd /home/train/14.genome_comparison/b.OrthoFinder
# 默认设置下输出文件夹在输入文件夹里面,使用-o参数设置输出文件夹路径。使用-op参数设置运行diamond前终止,以手动运行diamond并设置更严格的BLASTP阈值。
orthofinder.py -f compliantFasta -o OrthoFinder -op > command.diamond.list
# 修改diamond参数,手动BLASTP分析并过滤比对结果
perl -p -i -e 'if (m/diamond blastp/ ) { s/$/ --outfmt 5 --max-target-seqs 50 --id 10/; s/--compress 1//; s/.txt/.xml/; s/ -e \S+/ --evalue 1e-5/; s/ -p 1 / -p 4 /; } else { s/^.*\n$//; }' command.diamond.list
ParaFly -c command.diamond.list -CPU 2
perl -e 'while (<>) { print "parsing_blast_result.pl --no-header --max-hit-num 500 --evalue 1e-9 --CIP 0.3 --query-coverage 0.5 --subject-coverage 0.5 $1.xml | gzip -c - > $1.txt.gz\n" if m/(\S+).xml/; }' command.diamond.list > command.parsing_blast_result.list
ParaFly -c command.parsing_blast_result.list -CPU 8
# 继续运行OrthoFinder命令进行直系同源基因分析
OrthoFinderWorkingDir=`head -n 1 command.diamond.list | perl -ne 'print $1 if m/-d (\S+)\//'`
orthofinder.py -b $OrthoFinderWorkingDir -og
# 得到直系同源基因聚类结果groups.txt
orthomcl_mcl2Groups.pl OrthoFinder/*/WorkingDirectory/OrthoFinder/*/Orthogroups/Orthogroups.txt
ln -sf groups.OG.txt groups.txt
# 进行同源基因数量的统计
cat groups.OG.txt groups.PG.txt groups.filtered.txt > groups.All.txt
orthomcl_genes_number_stats.pl groups.All.txt compliantFasta > genes_number_stats.txt
cd ..
#通过共有的基因进行接下来的分析
## c. 使用单拷贝同源基因构建物种树
mkdir -p /home/train/14.genome_comparison/c.species_tree
cd /home/train/14.genome_comparison/c.species_tree
# 1. 根据 orthomcl 结果提取单拷贝同源基因
--out_directory orthologGroups_CDS --species_ratio 1 --single_copy_species_ratio 0.5 --copy_num 10 --max_seq_length 6000 ../b.OrthoFinder/groups.txt ../b.OrthoFinder/compliantFasta_CDS
orthomcl_extract_ortholog_seqs.pl --out_directory orthologGroups_Protein --species_ratio 1 --single_copy_species_ratio 0.5 --copy_num 10 --max_seq_length 6000 --compliantFasta_dir_for_calculating_seq_length ../b.OrthoFinder/compliantFasta_CDS ../b.OrthoFinder/groups.txt ../b.OrthoFinder/compliantFasta
perl -p -i -e 's/\*$//; s/\*/X/g' orthologGroups_Protein/*.fasta
ls orthologGroups_Protein/ | perl -pe 's/.fasta//' > orthologGroups.txt
# 培训班上为了减少计算量,执行以下步骤直接降到1/10的数据量。真实处理数据时请勿运行下一个命令。
perl -e 'while (<>) { $num ++; print if ($num % 10) == 1; }' orthologGroups.txt > aa; mv aa orthologGroups.txt
# 2. 对单拷贝同源进行进行多序列比对
for i in `cat orthologGroups.txt`
do
echo "linsi orthologGroups_Protein/$i.fasta > orthologGroups_Protein/$i.fasta.align"
done > command.mafft.list
ParaFly -c command.mafft.list -CPU 8
# 3. 将Protein序列比对结果转换为Codon序列比对结果
for i in `cat orthologGroups.txt`
do
echo "proteinAlignment2CDSAlignemnt.pl orthologGroups_Protein/$i.fasta.align orthologGroups_CDS/$i.fasta > orthologGroups_CDS/$i.fasta.align"
done > command.proteinAlignment2CDSAlignemnt.list
ParaFly -c command.proteinAlignment2CDSAlignemnt.list -CPU 8
# 4. 对Codon序列比对结果进行保守区块提取
for i in `cat orthologGroups.txt`
do
echo "Gblocks orthologGroups_CDS/$i.fasta.align -t=c -b3=15 -b4=15; if [ -f orthologGroups_CDS/$i.fasta.align-gb ]; then echo $i completed; fi"
echo "Gblocks orthologGroups_Protein/$i.fasta.align -t=p -b3=5 -b4=5; if [ -f orthologGroups_Protein/$i.fasta.align-gb ]; then echo $i completed; fi"
done > command.Gblocks.list
ParaFly -c command.Gblocks.list -CPU 8
# 5. 将各个同源基因家族的多序列比对结果转换为Phylip格式
for i in `cat orthologGroups.txt`
do
perl -p -e 's/(^>\w+).*/$1/; s/ //g' orthologGroups_CDS/$i.fasta.align-gb > orthologGroups_CDS/$i.fasta.align-gb.fasta
done
for i in `cat orthologGroups.txt`
do
echo "/opt/biosoft/RAxML-8.2.12/usefulScripts/convertFasta2Phylip.sh orthologGroups_CDS/$i.fasta.align-gb.fasta > orthologGroups_CDS/$i.phy"
done > command.convertFasta2Phylip.list
ParaFly -c command.convertFasta2Phylip.list -CPU 8
# 6. 整合所有单拷贝同源基因的Codon序列比对结果,并使用ML算法构建物系统发育树
perl -e 'while (<orthologGroups_CDS/*.align-gb>) { open IN, $_ or die $!; while (<IN>) { if (m/^>(\w+)/) { $seq_id = $1; } else { s/\s+//g; $seq{$seq_id} .= $_; } } } foreach (sort keys %seq) { print ">$_\n$seq{$_}\n"; }' > allSingleCopyOrthologsAlign.Codon.fasta
mkdir FastTree
cd FastTree
FastTree -nt -gtr -gamma ../allSingleCopyOrthologsAlign.Codon.fasta > tree_abbr.FastTree #核心参数-gtr -gamma伽马分布 如果用蛋白序列更改参数,并且需要添加ProTest选择最优氨基酸替代模型,再用raml软件进行分析;陈老师推荐用核苷酸序列进行分析
#参考[train@MiWiFi-R3P-srv FastTree]$ les /home/train/00.incipient_data/data_for_genome_comparison/prottest.out
cp tree_abbr.FastTree tree_fullName.FastTree
cut -f 1,2 ../../a.preparing_data/source.txt | perl -p -e 's/\t/\t\"/; s/$/\"/;' | perl -p -e 's#\s+#/#; s#^#perl -p -i -e "s/#; s#\n$#/" tree_fullName.FastTree\n#;' | perl -p -e 's#/\"#/\\\"#; s#\"/#\\\"/#;' | sh
cp tree* ../
cd ..
mkdir RAxML
cd RAxML
/opt/biosoft/RAxML-8.2.12/usefulScripts/convertFasta2Phylip.sh ../allSingleCopyOrthologsAlign.Codon.fasta > allSingleCopyOrthologsAlign.Codon.phy
/opt/biosoft/RAxML-8.2.12/raxmlHPC-PTHREADS-SSE3 -f a -x 12345 -p 12345 -# 100 -m GTRGAMMA -s allSingleCopyOrthologsAlign.Codon.phy -n out_codon -T 8
cp RAxML_bipartitions.out_codon tree_abbr.RAxML
cp tree_abbr.RAxML tree_fullName.RAxML
cut -f 1,2 ../../a.preparing_data/source.txt | perl -p -e 's/\t/\t\"/; s/$/\"/;' | perl -p -e 's#\s+#/#; s#^#perl -p -i -e "s/#; s#\n$#/" tree_fullName.RAxML\n#;' | perl -p -e 's#/\"#/\\\"#; s#\"/#\\\"/#;' | sh
cp tree* ../
[train@MiWiFi-R3P-srv RAxML]$ ll
total 2932
-rw-r--r-- 1 train train 2965047 Jul 25 09:35 allSingleCopyOrthologsAlign.Codon.phy
-rw-r--r-- 1 train train 256 Jul 25 09:36 RAxML_bestTree.out_codon
-rw-r--r-- 1 train train 267 Jul 25 09:36 RAxML_bipartitionsBranchLabels.out_codon
-rw-r--r-- 1 train train 261 Jul 25 09:36 RAxML_bipartitions.out_codon
-rw-r--r-- 1 train train 4500 Jul 25 09:36 RAxML_bootstrap.out_codon #拓扑结构
-rw-r--r-- 1 train train 14229 Jul 25 09:36 RAxML_info.out_codon
###################################################################################
# 真实数据处理时,数据量较大时,推荐按如下步骤使用raxmlHPC-HYBRID-AVX进行并行化计算
# export PKG_CONFIG_PATH=/opt/sysoft/mpich-4.1.2/lib/pkgconfig:$PKG_CONFIG_PATH
# export LD_LIBRARY_PATH=/opt/sysoft/mpich-4.1.2/lib:$LD_LIBRARY_PATH
# export C_INCLUDE_PATH=/opt/sysoft/mpich-4.1.2/include:$C_INCLUDE_PATH
# export PATH=/opt/sysoft/mpich-4.1.2/bin:$PATH
# export LD_PRELOAD=/opt/sysoft/mpich-4.1.2/lib/libmpi.so
# mpirun -np 15 /opt/biosoft/RAxML-8.2.12/raxmlHPC-HYBRID-AVX -f a -x 12345 -p 12345 -# 1000 -m GTRGAMMAX -s allSingleCopyOrthologsAlign.Codon.phy -n out_codon -T 8
#####################################################################################
# FigTree画树,并输出树文件信息
java -jar /opt/biosoft/FigTree_v1.4.4/lib/figtree.jar RAxML_bipartitions.out_codon #根据拓扑结构构建进化树
# parub Agaricomycetidae(伞菌亚纲); Boletales(牛肝菌目)
# sccit Agaricomycetidae(伞菌亚纲); Boletales(牛肝菌目)
# laame Agaricomycetidae(伞菌亚纲); Agaricales(伞菌目)
# plost Agaricomycetidae(伞菌亚纲); Agaricales(伞菌目)
# phgig Agaricomycetes incertae sedis; Polyporales(多孔菌目)
# lasul Agaricomycetes incertae sedis; Polyporales(多孔菌目)
cd ../
#无根树,每个点出发都有三根线,横线距离展现不同物种之间的差异
#图例表示位点替换率
#转换多于颠换
#氨基酸第三个密码子的突变概率更大,密码子的摆动学说
#paml4.9i可以采用穷举法根据树的拓扑结构计算距离
# 7. 可能有部分单拷贝同源基因不利于构建系统发育树,将这些基因挑出来去除掉后,有利于构建更准确的物种树
# 此步骤不是必须的,操作有点复杂,对进化树准确性提高不多。但当RAxML树局部区域拓扑结构不正确,进行修改后,没有枝长数据后,可以考虑使用PAML软件的baseml进行枝长计算。
# 使用PAML软件的baseml对各个单拷贝同源基因进行枝长分析,再与标准树进行比较,剔除异常的基因。
mkdir /home/train/14.genome_comparison/c.species_tree/paml_baseml
cd /home/train/14.genome_comparison/c.species_tree/paml_baseml
# 7.1 根据RAxML的结果提取系统发育树的拓扑结构
echo "6 1" > input.trees
echo "(((parub,sccit),(laame,plost)),(phgig,lasul));" >> input.trees
# 7.2 分别对各个单拷贝同源基因的密码子比对结果用codeml进行系统发育分析,得到每个各个同源基因的树信息
mkdir OG_trees
for i in `cat ../orthologGroups.txt`
do
echo "calculating_branchLength_by_baseml.pl ../orthologGroups_CDS/$i.phy input.trees > OG_trees/$i.tree" #分别对应每棵树进行构建支长
done > command.calculating_branchLength_by_baseml.list
ParaFly -c command.calculating_branchLength_by_baseml.list -CPU 8
# 7.3 整合所有单拷贝同源基因进化树。整合原理:先计算每个树的总枝长,利用其中位数对所有单拷贝同源基因树进行标准化,使每个树的总枝长等于该中位数;再对进化树的每个分支取所有基因的中位数作为其枝长。
get_integrated_tree_from_multiple_trees.pl OG_trees/* > standard.tree
# 7.4 分析每个各个同源基因树枝长和标准树的平均偏差百分比:单拷贝同源基因某分枝的偏差百分比 = (绝对值{单拷贝同源基因树某分>枝的枝长 - 标准树中该分枝的枝长} / 标准树中该分枝的枝长) * 100%;然后计算所有分枝偏差百分比的平均值。
for i in `cat ../orthologGroups.txt`
do
echo "calculating_branchLength_bias_percentage_of_two_trees.pl standard.tree OG_trees/$i.tree > OG_trees/$i.bias"
done > command.calculating_branchLength_bias_percentage_of_two_trees.list
ParaFly -c command.calculating_branchLength_bias_percentage_of_two_trees.list -CPU 8
perl -e 'while (<OG_trees/*.bias>) { my $og = $1 if m/(\w+).bias/; open IN, $_; $_ = <IN>; s/\%//g; print "$og\t$_" if $_; }' > bias.txt
# 剔除和综合树差异较大的单拷贝同源基因。第一轮按左侧99%置信区间保留;第二轮按左侧95%置信区间保留。
remove_extremum_through_normal_distribution.pl --data_column 4 --confidence_interval 5 bias.txt > keep_bias_99.txt
remove_extremum_through_normal_distribution.pl --data_column 4 --confidence_interval 2 keep_bias_99.txt > keep_bias_95.txt
cut -f 1 keep_bias_95.txt > keep_OG.txt
# 7.5 剔除和综合树差异较大的单拷贝同源基因,利用剩下的基因整合出更准确的综合树。
get_integrated_tree_from_multiple_trees.pl --method median `perl -pe 's#^#OG_trees/#; s/\n/.tree /' keep_OG.txt` > tree_abbr.baseml_median
get_integrated_tree_from_multiple_trees.pl --method mean `perl -pe 's#^#OG_trees/#; s/\n/.tree /' keep_OG.txt` > tree_abbr.baseml_mean
cp tree_abbr.baseml_median tree_fullName.baseml_median
cp tree_abbr.baseml_mean tree_fullName.baseml_mean
cut -f 1,2 ../../a.preparing_data/source.txt | perl -p -e 's/\t/\t\"/; s/$/\"/;' | perl -p -e 's#\s+#/#; s#^#perl -p -i -e "s/#; s#\n$#/" tree_fullName.baseml_median\n#;' | perl -p -e 's#/\"#/\\\"#; s#\"/#\\\"/#;' | sh
cut -f 1,2 ../../a.preparing_data/source.txt | perl -p -e 's/\t/\t\"/; s/$/\"/;' | perl -p -e 's#\s+#/#; s#^#perl -p -i -e "s/#; s#\n$#/" tree_fullName.baseml_mean\n#;' | perl -p -e 's#/\"#/\\\"#; s#\"/#\\\"/#;' | sh
cp tree_* ../
cd ../../
# d. 使用PAML软件的mcmctree进行物种分歧时间计算
mkdir -p /home/train/14.genome_comparison/d.divergence_time
cd /home/train/14.genome_comparison/d.divergence_time
# (1) 准备Newick格式的树文件,根据RAxML的结果提取系统发育树的拓扑结构
echo "6 1" > input.trees
echo "(((laame,plost),(parub,sccit)'>0.695<1.313')'>1.403<1.992',(lasul,phgig));" >> input.trees# 校准点 >0.695<1.313以1亿年为单位
#可以在http://timetree.org/上查看物种分歧时间
# Paxillus rubicundulus <=> Scleroderma citrinum 87MYA 69.5-131.3MYA
# Paxillus rubicundulus <=> Laccaria amethystina 145MYA 140.3-199.2MYA
# (2) 准备多序列比对Phylip格式输入文件,可以综合三个位点核酸序列的多序列比对文件
/opt/biosoft/RAxML-8.2.12/usefulScripts/convertFasta2Phylip.sh ../c.species_tree/allSingleCopyOrthologsAlign.Codon.fasta > input.txt
perl -p -i -e 's/\s+/ /' input.txt #/ /中有两个空格
# (3) 准备paml mcmctree配置文件
perl -p -e 's/mtCDNApri123/input/; s/mtCDNApri/input/; s/<1.0/<4.0/; s/^(\s*ndata =) .*/$1 1/; s/usedata = .*/usedata = 3/; s/model = .*/model = 7/; s/alpha = .*/alpha = 0.5/; s/ncatG = .*/ncatG = 5/; s/cleandata = .*/cleandata = 0/; s/burnin = .*/burnin = 800000/; s/sampfreq = .*/sampfreq = 100/; s/nsample = .*/nsample = 20000/;' /opt/biosoft/paml4.9i/examples/DatingSoftBound/mcmctree.ctl > mcmctree.ctl
#修改路径,RootAge = ‘<4.0’
#运行常规数据时,推荐加大取样量和burnin数量。为了让程序运行更快,上面的命令只有1/5的计算量。
#perl -p -e 's/mtCDNApri123/input/; s/mtCDNApri/input/; s/<1.0/<8.0/; s/^(\s*ndata =) .*/$1 1/; s/usedata = .*/usedata = 3/; s/model = .*/model = 7/; s/alpha = .*/alpha = 0.5/; s/ncatG = .*/ncatG = 5/; s/cleandata = .*/cleandata = 0/; s/burnin = .*/burnin = 4000000/; s/sampfreq = .*/sampfreq = 100/; s/nsample = .*/nsample = 100000/;' /opt/biosoft/paml4.9i/examples/DatingSoftBound/mcmctree.ctl > mcmctree.ctl
# (4) 运行PAML软件的mcmctree (若数据量较大,则每当程序调用baseml时,按ctrl + c终止,再手动并行化运行baseml)
mcmctree mcmctree.ctl
cp out.BV in.BV
# (5) 手动并行化运行baseml
#echo "mkdir tmp0001; cd tmp0001; ln -s ../input* ../tmp0001* ./; baseml tmp0001.ctl;
#mkdir tmp0002; cd tmp0002; ln -s ../input* ../tmp0002* ./; baseml tmp0002.ctl;
#mkdir tmp0003; cd tmp0003; ln -s ../input* ../tmp0003* ./; baseml tmp0003.ctl;" > command.baseml.list
#ParaFly -c command.baseml.list -CPU 3
#cat tmp0001/rst2 tmp0002/rst2 tmp0003/rst2 > in.BV
# (6) 使用mcmctree进行approximate likelihood分析#该步骤比较耗时
perl -p -i -e 's/usedata = .*/usedata = 2 \* 0:/;' mcmctree.ctl
mkdir run01 run02 run03 run04 run05
cp input.txt input.trees mcmctree.ctl in.BV run01
cp input.txt input.trees mcmctree.ctl in.BV run02
cp input.txt input.trees mcmctree.ctl in.BV run03
cp input.txt input.trees mcmctree.ctl in.BV run04
cp input.txt input.trees mcmctree.ctl in.BV run05
echo 'cd run01; mcmctree mcmctree.ctl &> mcmctree.log
cd run02; mcmctree mcmctree.ctl &> mcmctree.log
cd run03; mcmctree mcmctree.ctl &> mcmctree.log
cd run04; mcmctree mcmctree.ctl &> mcmctree.log
cd run05; mcmctree mcmctree.ctl &> mcmctree.log' > command.mcmctree.list
ParaFly -c command.mcmctree.list -CPU 5
out.txt是我们所需要的文件
# (7) 比较5次运行的MCMC树,若差异较小,则认可其结果
for i in 01 02 03 04 05
do
perl -n -e 'my $out; while (s/(.*?(\d\.\d+))//) { my $info = $1; my $value = $2; my $new = $value * 100; $info =~ s/$value/$new/; $out .= $info; } $out .= $_; print $out' run$i/FigTree.tre > tree$i.nex
perl -e 'while (<>) { if (s/\s*UTREE.*?=\s*//) { s/\s*\[.*?\]//g; print; } }' tree$i.nex > tree$i.txt
done
echo -e "tree01.txt tree02.txt\ntree01.txt tree03.txt\ntree01.txt tree04.txt\ntree01.txt tree05.txt\ntree02.txt tree03.txt\ntree02.txt tree04.txt\ntree02.txt tree05.txt\ntree03.txt tree04.txt\ntree03.txt tree05.txt\ntree04.txt tree05.txt" > for_compare.txt
perl -pe 's/^/calculating_branchLength_bias_percentage_of_two_trees.pl --no_normalization_of_total_branch_length /; s/$/ >> bias.txt/;' for_compare.txt | sh
perl -e 'while (<>) { @_ = split /\t/, $_; $total += $_[-1]; $num ++; } $mean = $total / $num; print "$mean\%\n";' bias.txt
# 1.755%
rm tree*
perl -n -e 'my $out; while (s/(.*?(\d\.\d+))//) { my $info = $1; my $value = $2; my $new = $value * 100; $info =~ s/$value/$new/; $out .= $info; } $out .= $_; print $out' run01/FigTree.tre > tree_abbr.mcmctree
# (8) 将树信息中的简称换成属名和种名全称
cp tree_abbr.mcmctree tree_fullName.mcmctree
cut -f 1,2 ../a.preparing_data/source.txt | perl -p -e 's/\t/\t\"/; s/$/\"/;' | perl -p -e 's#\s+#/#; s#^#perl -p -i -e "s/#; s#\n$#/" tree_fullName.mcmctree\n#;' | perl -p -e 's#/\"#/\\\"#; s#\"/#\\\"/#;' | sh
java -jar /opt/biosoft/FigTree_v1.4.4/lib/figtree.jar tree_fullName.mcmctree #获得分子钟分析结果
cd ..
## e. 使用CAFE进行基因家族扩张分析
mkdir -p /home/train/14.genome_comparison/e.cafe
cd /home/train/14.genome_comparison/e.cafe
# 根据 orthomcl 结果得到基因家族的数量信息,提取至少在3个species中存在该 gene family 的OGs 。除了用直系同源基因,也可以用pfam作
orthomcl_extract_ortholog_seqs.pl --out_directory orthologGroups_CDS --out_tab_for_CAFE orthomcl2cafe.tab --species_ratio 0.4 --single_copy_species_ratio 0.0 --copy_num 1000 --max_seq_length 100000 ../b.OrthoFinder/groups.txt ../b.OrthoFinder/compliantFasta_CDS
rm orthologGroups_CDS -rf
# 将数据异常的基因家族挑出来。
cafe_filter_abnormal_big_size_gene_family.pl --max_fold_change 10 --max_gene_family_size 100 --out_filter_matrix orthomcl2cafe.bad.tab orthomcl2cafe.tab > orthomcl2cafe.good.tab 2> cafe_filter_abnormal_big_size_gene_family.log
# 准备分歧时间树文件
echo "(((laame:145.3064,plost:145.3064):36.9226,(parub:98.0821,sccit:98.0821):84.1468):11.9203,(lasul:148.3943,phgig:148.3943):45.7551);" > tree.txt
# 根据数据正常的基因家族评估error model和Lamda,并进行基因家族扩张收缩分析
cafe5 -i orthomcl2cafe.good.tab -t tree.txt -e -p -o results_1
# 对数据异常的基因家族进行扩张收缩分析
perl -e '<>; while (<>) { chomp; @_ = split /\t/; shift @_; shift @_; foreach (@_) { $n{$_} = 1; } } @n = sort {$b <=> $a} keys %n; print "maxcnt:$n[0]\n";' orthomcl2cafe.bad.tab > Base_error_model.txt
head -n 4 results_1/Base_error_model.txt | tail -n 3 >> Base_error_model.txt
LAMDA=`perl -e 'while (<>) { print $1 if m/Lambda:\s*(\S+)/; }' results_1/Base_results.txt`
Possion=`perl -e 'while (<>) { print $1 if m/Maximum possible lambda for this topology:\s*(\S+)/; }' results_1/Base_results.txt`
cafe5 -i orthomcl2cafe.bad.tab -t tree.txt -eBase_error_model.txt -l $LAMDA -p$Possion -o results_2 #对所有基因家族进行分析
# 对结果进行整合和统计
parsing_cafe5Out.pl results_1/Base_count.tab,results_2/Base_count.tab results_1/Base_branch_probabilities.tab,results_2/Base_branch_probabilities.tab results_1/Base_asr.tre
cd ..
## f. 使用PAML软件的codeml进行正选择基因分析
mkdir -p /home/train/14.genome_comparison/f.PSG_analysis
cd /home/train/14.genome_comparison/f.PSG_analysis
# (1) 准备同源基因的密码子比对结果Phylip格式文件及其sub tree文件
# 选取至少在4个物种中出现的同源基因
orthomcl_extract_ortholog_seqs.pl --out_directory orthologGroups_CDS --species_ratio 0.6 --single_copy_species_ratio 0.0 --copy_num 1000 --max_seq_length 100000 ../b.OrthoFinder/groups.txt ../b.OrthoFinder/compliantFasta_CDS
orthomcl_extract_ortholog_seqs.pl --out_directory orthologGroups_Protein --species_ratio 0.6 --single_copy_species_ratio 0.0 --copy_num 1000 --max_seq_length 100000 --compliantFasta_dir_for_calculating_seq_length ../b.OrthoFinder/compliantFasta_CDS/ ../b.OrthoFinder/groups.txt ../b.OrthoFinder/compliantFasta
perl -p -i -e 's/\*$//; s/\*/X/g' orthologGroups_Protein/*.fasta #对序列进行处理
ls orthologGroups_Protein/ | perl -pe 's/.fasta//' > orthologGroups.txt
# 对同源基因的蛋白序列进行多序列比对--至少四个物种
# 非同义突变和同义突变
for i in `cat orthologGroups.txt`
do
echo "linsi orthologGroups_Protein/$i.fasta > orthologGroups_Protein/$i.fasta.align"
done > command.mafft.list
#ParaFly -c command.mafft.list -CPU 8
#上一个命令比较消耗计算。而培训班上为了快速进行,运行下一个命令直接得到结果。处理实际数据时需要执行上一个命令,而不是下一个。
rm -rf orthologGroups*; tar zxf /home/train/00.incipient_data/data_for_genome_comparison/PSG_orthologGroups.tar.gz
# 将蛋白序列比对结果转换为密码子序列比对结果
for i in `cat orthologGroups.txt`
do
echo "proteinAlignment2CDSAlignemnt.pl orthologGroups_Protein/$i.fasta.align orthologGroups_CDS/$i.fasta > orthologGroups_CDS/$i.fasta.align"
done > command.proteinAlignment2CDSAlignemnt.list
ParaFly -c command.proteinAlignment2CDSAlignemnt.list -CPU 8
perl -p -i -e 's/^(>\w+).*/$1/' orthologGroups_CDS/*.fasta.align
# 将多序列比对结果转换为Phylip格式
for i in `cat orthologGroups.txt`
do
perl -p -e 's/(^>\w+).*/$1/; s/ //g' orthologGroups_CDS/$i.fasta.align > orthologGroups_CDS/$i.fasta.align.fasta
done
for i in `cat orthologGroups.txt`
do
echo "/opt/biosoft/RAxML-8.2.12/usefulScripts/convertFasta2Phylip.sh orthologGroups_CDS/$i.fasta.align.fasta > orthologGroups_CDS/$i.phy"
done > command.convertFasta2Phylip.list
ParaFly -c command.convertFasta2Phylip.list -CPU 8
rm -rf orthologGroups_CDS/*.fasta*
# (2) 通过对同源基因序列之间两两计算dn/ds方法,筛选候选的正选择基因
echo "(((laame,plost),(parub,sccit)),(lasul,phgig));" > tree.txt
# 使用YN00算法计算dn/ds 非同义突变/同义突变=omega
mkdir dnds_yn00
for i in `cat orthologGroups.txt`
do
echo "calculating_omega_by_yn00.pl --omega_for_PSG 1.0 --input_tree tree.txt orthologGroups_CDS/$i.phy > dnds_yn00/$i.txt 2> dnds_yn00/$i.stats"
done > command.calculating_omega_by_yn00.list
ParaFly -c command.calculating_omega_by_yn00.list -CPU 8
rename PSGyes PSGyes.dnds_yn00 orthologGroups_CDS/*.phy.PSGyes
ls orthologGroups_CDS/*.phy.PSGyes.dnds_yn00 | perl -ne 'print "$1\n" if m/(OG\d+)/' > PSG_yn00.list
# 使用ML方法计算dn/ds
mkdir dnds_ML
for i in `cat orthologGroups.txt`
do
echo "calculating_omega_by_codeml.pl --omega_for_PSG 1.0 orthologGroups_CDS/$i.phy tree.txt > dnds_ML/$i.txt 2> dnds_ML/$i.stats"
done > command.calculating_omega_by_ML.list
#ParaFly -c command.calculating_omega_by_ML.list -CPU 8
#上一个命令比较消耗计算。培训班上可以直接使用cp命令得到结果。
#rename PSGyes PSGyes.dnds_ML orthologGroups_CDS/*.phy.PSGyes
#ls orthologGroups_CDS/*.phy.PSGyes.dnds_ML | perl -ne 'print "$1\n" if m/(OG\d+)/' > PSG_ML.list
cp ~/00.incipient_data/data_for_genome_comparison/PSG_ML.list ./
cat PSG_yn00.list PSG_ML.list | sort | uniq > candidate_PSG.list
#对两种方法进行比较
[train@MiWiFi-R3P-srv f.PSG_analysis]$ diff2file.pl PSG_ML.list PSG_yn00.list
file_common file1_only_num file2_only_num
7 0 2
# rm -rf */dnds_* dnds_ML dnds_yn00
# (3) 使用branch-site model对候选基因进行正选择分析;某个基因在某个分支上是否发生正选择
# 确定待分析的目标分枝
perl -e 'while (<>) { @_ = m/(\w+)/g; } foreach (@_) { print "$_:$_\n"; }' tree.txt > branches.txt
echo "Agaricales:laame,plost
Boletales:parub,sccit
Polyporales:lasul,phgig" >> branches.txt #九个分支
# 使用branch-site model对目标分枝上的候选基因进行正选择分析
# 该分析用的软件/opt/biosoft/paml4.9i/bin/codeml
for x in `cat branches.txt`
do
y=$x
x=${x/:*/}
y=${y/*:/}
mkdir -p BS_$x/
for i in `cat candidate_PSG.list` # 9个基因
do
echo "paml_branch-site_model_analysis.pl --target_branch_species $y orthologGroups_CDS/$i.phy tree.txt BS_$x/$i > BS_$x/$i.p"
done
done > command.paml_branch-site_model.list
ParaFly -c command.paml_branch-site_model.list -CPU 4 #9 x 9=81个命令,很费计算!
# 统计各分枝的正选择基因信息
for x in `cat branches.txt`
do
y=$x
x=${x/:*/}
y=${y/*:/}
cd BS_$x/
perl -e 'while (<*.p>) { my $og = $_; $og =~ s/.*\///; $og =~ s/.p//; open IN, $_; my $p = "NULL\n"; while (<IN>) { chomp; @_ = split /\t/; $p = "$_[0]\t$_[1]\n" if ($_[0] <= 0.05 or ($_[1] eq "BEB_Significance_YES")); } print "$ARGV[0]\t$og\t$p"; }' $x
cd ..
done | grep -v NULL > results.PSG.branches
perl -e 'while (<>) { @_ = split /\s+/; push @stats, $_[0] unless exists $stats{$_[0]}; $stats{$_[0]} ++; } foreach (@stats) { print "$_\t$stats{$_}\n"; }' results.PSG.branches > results.PSG.branches.stats
cd ..
## g. 使用MCScanX进行共线性区块分析
mkdir -p /home/train/14.genome_comparison/g.MCScanX
cd /home/train/14.genome_comparison/g.MCScanX
# 准备2个物种基因组的蛋白质序列文件和GFF文件
ln -s ../a.preparing_data/laame.geneModels.gff3 ./
ln -s ../a.preparing_data/laame.protein.fasta ./
ln -s ../a.preparing_data/laame.genome.fasta ./
ln -s ../a.preparing_data/plost.geneModels.gff3 .
ln -s ../a.preparing_data/plost.protein.fasta .
ln -s ../a.preparing_data/plost.genome.fasta ./
cat laame.protein.fasta plost.protein.fasta > all.fasta
perl -p -i -e 's/\*$//; s/\*/X/g;' all.fasta
diamond makedb --in all.fasta --db all
diamond blastp --db all --query all.fasta --out diamond.out --outfmt 5 --sensitive --max-target-seqs 100 --evalue 1e-5 --id 10 --tmpdir /dev/shm --threads 8
mkdir data
parsing_blast_result.pl --no-header --max-hit-num 100 --evalue 1e-6 --identity 0.5 --subject-coverage 0.5 --query-coverage 0.5 diamond.out > data/input.blast
perl -e 'while (<>) { if (m/^(\S+)\t.*\tgene\t(\d+)\t(\d+).*ID=([^\s;]+)/) { print "$1\t$4\t$2\t$3\n" } }' plost.geneModels.gff3 laame.geneModels.gff3 > data/input.gff
MCScanX data/input
grep -v -P "plost.*plost" data/input.collinearity | grep -P "plost" > data/input.collinearity_interspecific
mcscanx_stats_blocks.pl data/input.collinearity_interspecific > data/input.collinearity_interspecific.stats # 物种间
grep -P "plost.*plost" data/input.collinearity > data/input.collinearity_intraspecific
mcscanx_stats_blocks.pl data/input.collinearity_intraspecific > data/input.collinearity_intraspecific.stats # 物种内
circos_from_MCScanX_out.pl --out-ref-WGD out_WGD --ref-gff3 plost.geneModels.gff3 --ref-fasta plost.genome.fasta --query-gff3 laame.geneModels.gff3 --query-fasta laame.genome.fasta --ref-label PO --query-label LA --min-block-size 5 data/input.collinearity
cd out
circos -conf circos.conf
cd ../out_WGD
circos -conf circos.conf
cd ..
## h. 使用Mummer对两个基因组进行比较(泛基因组常用软件)
mkdir -p /home/train/14.genome_comparison/h.Mummer
cd /home/train/14.genome_comparison/h.Mummer
cp ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta ./
cp ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/idba.fasta ./
para_nucmer --CPU 8 Malassezia_sympodialis.genome_V01.fasta idba.fasta > out.delta
#nucmer -c 200 -g 200 -p out Malassezia_sympodialis.genome_V01.fasta IDBA.fasta
delta-filter -i 95 -l 10000 -r -q out.delta > out.rq_l10k.delta
show-coords -c -d -l -I 95 -L 10000 -r out.rq_l10k.delta > out.show
mummerplot -l --fat -p out --png --large out.rq_l10k.delta
# (1) 修改线条
# set style line 1 lt 1 lw 0.5 pt 3 ps 1
# 以上用于设置线条编号为1的信息:lt 修改线条颜色;lw 修改线条宽度;pt 修改点的形状,7表示使用小圆点,1表示使用加号;ps 修改点的大小。
# plot "out.fplot" title "FWD" w lp ls 1
# 以上是使用编号为 1 的线条对"out.fplot"文件中的数据进行绘制:ls 表示使用的 line style;w lp 即 with linespoints 使用点和线作图。
# 将线条设置较细,由 3 修改为 0.3;将点的大小设置较小,由 1 修改为 0.3;
# (2) 修改文件分辨率
# set terminal png tiny size 800,800
# 将后面的两个数字增大
# (3) 修改字体大小
# set terminal png tiny size 3000,3000 font 'Arial,28'
perl -p -i -e 's/REF/Reference/; s/QRY/Query/; s/lw\s+[\d\.]+/lw 0.3/; s/pt\s+[\d\.]+/pt 1/; s/ps\s+[\d\.]+/ps 0.3/; s/tiny size.*/tiny size 3000,3000 font "Arail,28"/;' out.gp
gnuplot out.gp
cd ..