仅供参考,友善交流
基因组survey
jellyfish统计k-mer频率
singularity exec AssemblyTools.sif jellyfish count -t 32 -C -m 21 -s 20G F_mickiemi.ccs.fastq #一定要解压!
singularity exec AssemblyTools.sif histo -t 10 -o mer_counts.histo mer_counts.jf
singularity exec AssemblyTools.sif genomescope.R -i mer_counts.histo -o gs_out -p 2 -k 21

组装
hifiasm组装
hifiasm -o F_mickiemi.asm -t 32 F_mickiemi.ccs.fastq.gz
其中:
1.-o为输出文件前缀
2.-t表示线程
3.若不需要分型组装,可以添加--primary
产出的结果主要是prefix.bp.p_ctg.gfa
获得组装后的contig水平的基因组
awk '/^S/{print ">"$2;print $3}' F_mickiemi.asm.bp.p_ctg.gfa > F_mickiemi.ctg.fa
初步统计
python3 quast.py F_mickiemi.ctg.fa

###挂载
####使用yahs软件进行挂载
- Hic数据比对到ctg水平基因组
bowtie2-2.4.5-linux-x86_64/bowtie2-build F_mickiemi.ctg.fa F_mickiemi
bowtie2 -x F_mickiemi -p 20 -1 Hic_R1.fastq.gz -2 Hic_R2.fastq.gz --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder | samtools sort -O bam -@ 20 -o - > output.hic.bam
samtools sort -@ 20 -o output.hic.sort.bam output.hic.bam - yahs挂载
yahs F_mickiemi.ctg.fa output.hic.bam
juicer pre -a -o out_JBAT yahs.out.bin yahs.out_scaffolds_final.agp F_mickiemi.ctg.fa.fai >out_JBAT.log 2>&1
cat out_JBAT.log | grep PRE_C_SIZE | awk '{print3}' >chrom.sizes
java -jar -Xmx150G juicer_tools.1.9.9_jcuda.0.8.jar pre out_JBAT.txt out_JBAT.hic.partchrom.sizes
===调挂载
juicer post -o out_JBAT out_JBAT.v1.review.assembly out_JBAT.liftover.agp 00.GY.t2t.hifiasm.fa

###挂载
####重复序列注释
1. repaetmodeler + repeatmasker
参考以前文章
####基因结构注释
1.braker模型训练
rna-seq比对到基因组上
singularity exec GENETools.sif hisat2-build yahs.out_scaffolds_final.fa yahs.out_scaffolds_final.fa
singularity exec GENETools.sif hisat2 --new-summary -p 50 -x yahs.out_scaffolds_final.fa -1 /opt/synData/wangyinjia16/work/Fuck/00.rawdata/mickiemi_good_1.fq.gz -2 /opt/synData/wangyinjia16/work/Fuck/00.rawdata/mickiemi_good_2.fq.gz -S rnaseq.sam 2>hisat.log
samtools sort -o rnaseq.bam rnaseq.sam
转录本拼接
stringtie rnaseq.bam -p 30 -o F_mi.gtf
gffread -w transcript.fa -g ../yahs.out_scaffolds_final.soft.fa F_mi.gtf
运行braker
1.生成key文件
singularity exec GENETools.sif cp /opt/GeneMark-ET/gmes_linux_64/gm_key $HOME/.gm_key
2.配置augustus config文件
singularity exec GENETools.sif cp -r /opt/augustus/config $HOME/augustus_config
3.运行braker
singularity exec GENETools.sif braker.pl --species=F_mi --genome=yahs.out_scaffolds_final.soft.fa --softmasking --bam=rnaseq.bam --cores=40 --workingdir=workdir --useexisting
注:
--softmasking 可提供软屏蔽基因组,可删除
--species 若为非库里物种,会在config添加文件夹,若有可直接指定
--cores 核心数,不提倡超过48
--workingdir 工作目录,若报错,可用chmod 777 添加权限
--useexisting 类似断点续跑,第一次则不用添加
2.maker注释
1.生成配置文件并修改
singularity exec GENETools.sif maker --CTL
注:maker_opts.ctl
genome= 基因组路径
est= 拼接的转录本
protein= 同源物种蛋白序列
model_org= 是否使用Repbase数据库屏蔽重复序列,若用,则为all
rmlib= 若上一个不为all,则添加本身物种的重复序列库
augustus_species= augustus预测的结果
est2genome=1
protein2genome=1
cpus=30
max_dna_len=200000 按100K切割染色体进行预测
min_contig=10000 跳过10K的染色体
min_protein=30 至少需要30个氨基酸
always_complete=1 基因都是完整的,起始密码子开始,终止密码子结束
- 运行maker
singularity exec --bind $HOME/augustus_config:/opt/augustus/config GENETools.sif maker - 处理结果文件
singularity exec GENETools.sif fasta_merge -d scaffold_16_master_datastore_index.log -o Fmi
注:scaffold_16_master_datastore_index.log 运行的log文件
singularity exec GENETools.sif gff3_merge -d scaffold_16_master_datastore_index.log -o Fmi.all.maker.gff3.tmp
cat Fmi.all.maker.gff3.tmp | awk '0}' > Fmi.all.maker.gff3
singularity exec GENETools.sif maker_map_ids --prefix Fmi_ --justify 6 Fmi.all.maker.gff3 > all.id.map
singularity exec GENETools.sif map_gff_ids all.id.map Fmi.all.maker.gff3 - 排序gff3
gff3_sort -g Fmi.all.maker.gff3 -og Fmi.all.maker.sorted.gff3
[pip3 install gff3tool]
3. BUSCO评估
- 蛋白模式 脊椎动物库
singularity exec busco_v5.7.0_cv1.sif busco -m prot -i Fmi.pep.fa -o F_mi_pro_vertebrata_odb10 -l vertebrata_odb10 -c 10 - 基因组模式 脊椎动物库
singularity exec busco_v5.7.0_cv1.sif busco -m genome -i yahs.out_scaffolds_final.soft.fa -o F_mi_genome_vertebrata_odb10 -l vertebrata_odb10 -c 10
####另一版基因结构注释pipeline
1. trimmomatic
java -jar trimmomatic-0.38.jar PE -threads 16 -phred33 -trimlog trim.log mickiemi_good_1.fq.gz mickiemi_good_2.fq.gz -baseout Fmi ILLUMINACLIP:TruSeq3-PE.fa:2:30:10
mv Fmi_1P mickiemi_tri_1.fq
mv Fmi_2P mickiemi_tri_2.fq
2. hisat2-stringtie-transdecoder
hisat2 --new-summary -p30 -x yahs.out_scaffolds_final.fa -1 mickiemi_tri_1.fq -2 mickiemi_tri_2.fq --very-sensitive -S hisat_Fmi.sam
samtools sort -@ 30 -o Fmi_Re.bam hisat_Fmi.sam
~/../106public/software/stringtie-1.3.3b.Linux_x86_64/stringtie Fmi_Re.bam -p 30 -o Fmi.stringtie.gtf
gffread -w transcript.fa -g yahs.out_scaffolds_final.soft.fa ../004.stringtie/Fmi.stringtie.gtf
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/TransDecoder.LongOrfs -t transcript.fa
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/TransDecoder.Predict -t transcript.fa
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/util/gtf_to_alignment_gff3.pl ../004.stringtie/Fmi.stringtie.gtf >Fmi.stringtie.gff3
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/util/cdna_alignment_orf_to_genome_orf.pl transcript.fa.transdecoder.gff3 Fmi.stringtie.gff3 transcript.fa > transcripts.fasta.transdecoder.genome.gff3
3. trinity
singularity exec -B /opt/synData/wangyinjia16/ -e ~/software/trinityrnaseq.v2.15.1.simg Trinity --seqType fq --left mickiemi_tri_1.fq --right mickiemi_tri_2.fq --CPU 20 --max_memory 50G --output Fmi.Re.trinity
4. evigene去除trinity的冗余
将exonerate中的bin添加至环境变量
安装cd-hit
perl evigene/scripts/rnaseq/trformat.pl -input Fmi.Re.trinity.Trinity.fasta -output ID-assembly_1.fa
perl evigene/scripts/prot/tr2aacds.pl -cdna ID-assembly_1.fa
5. PASA
/data/01/user157/software/PASApipeline/Launch_PASA_pipeline.pl -C -R -r --TRANSDECODER -c alignAssembly.config --genome yahs.out_scaffolds_final.soft.fa --transcripts ID-assembly_1.okay.cds --ALIGNERS blat,gmap --CPU 20
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/TransDecoder.LongOrfs -t ../../trinity-pasa/01.non-red/myDB.sqlite.assemblies.fasta
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/TransDecoder.Predict -t ../../trinity-pasa/01.non-red/myDB.sqlite.assemblies.fasta
~/../106public/software/TransDecoder-TransDecoder-v5.7.1/util/cdna_alignment_orf_to_genome_orf.pl myDB.sqlite.assemblies.fasta.transdecoder.gff3 ../../trinity-pasa/01.non-red/myDB.sqlite.pasa_assemblies.gff3 ../../trinity-pasa/01.non-red/myDB.sqlite.assemblies.fasta > non-red-trinity-PASA.transdecoder.genome.gff3
6. augustus
先用braker训练模型
上面有
可以拆分染色体做
singularity exec -B /opt/synData2/anx21 ~/software/GENETools.sif augustus --softmasking=1 --AUGUSTUS_CONFIG_PATH=/home/anx21/augustus_config --species=F_mi final_fa/scaffold_1001-2000.fa --UTR=off > final_fa/scaffold_1001-2000.fa.out && perl ConvertFormat_augustus.pl final_fa/scaffold_1001-2000.fa.out final_fa/scaffold_1001-2000.fa.out.con
7. genescan
perl /data/01/user186/script/genome_annotation/split_fasta.pl yahs.out_scaffolds_final.soft.fa fa-split 5000000
/data/00/software/genscan/genscan /data/00/software/genscan/HumanIso.smat fa-split/scaffold_99.fa | perl /data/01/user186/script/genome_annotation/repetitive/ConvertFormat_genscan.pl - > output/scaffold_99.fa.gff
8. gemoma
用 人 小鼠 大鼠的数据
java -jar -Xmx80g /data/01/user194/bio-software/GeMoMa-1.8/GeMoMa-1.8.jar CLI GeMoMaPipeline threads=40 t=scaffold_2001-2635.fa s=own g=/data/01/user186/00.ref.genome.NCBI/Rat.fa a=/data/01/user186/00.ref.genome.NCBI/Rat.gff outdir=test_1 AnnotationFinalizer.r=NO tblastn=false
java -jar -Xmx80g ~/software/GeMoMa-1.8/GeMoMa-1.8.jar CLI Extractor a=Rat.gff g=Rat.fa Ambiguity=AMBIGUOUS outdir=Rat_run1
mmseqs createdb yahs.out_scaffolds_final.soft.fa Rat_run1/mmseqsdb -v 2
mkdir Rat_run1/ref
mmseqs createdb {out}/ref/mmseqsdb -v 2
mmseqs search Rat_runt/ref/mmseqsdb Rat_runt/mmseqsdb Rat_runt/ref/mmseqsdb_align.out Rat_runt/ref/mmseqsdb_tmp -e 100.0 --threads 20 -s 8.5 -a --comp-bias-corr 0 --max-seqs 500 --mask 0 --orf-start-mode 1 -v 2
mmseqs convertalis Rat_runt/ref/mmseqsdb Rat_runt/mmseqsdb Rat_runt/ref/mmseqsdb_align.out Rat_runt/search.txt --threads 20 --format-output "query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,empty,raw,nident,empty,empty,empty,qframe,tframe,qaln,taln,qlen,tlen" -v 2
java -jar -Xmx80g ~/software/GeMoMa-1.8/GeMoMa-1.8.jar CLI Extractor a=Rat.gff g=Rat.fa Ambiguity=AMBIGUOUS outdir=Rat_run1
makeblastdb -out {target} -title "target" -dbtype nucl
tblastn -query Rat_run1/cds-parts.fasta -db Rat_run1/blastdb -evalue 100.0 -out Rat_run1/search.txt -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles" -db_gencode 1 -matrix BLOSUM62 -seg no -word_size 3 -comp_based_stats F -gapopen 11 -gapextend 1 -num_threads 40
###参考资料
https://github.com/chhylp123/hifiasm
https://quast.sourceforge.net/
https://github.com/gmarcais/Jellyfish
https://github.com/schatzlab/genomescope
https://github.com/c-zhou/yahs
https://github.com/Gaius-Augustus/BRAKER
https://www.yandell-lab.org/software/maker.html
https://busco.ezlab.org/busco_userguide.html#genome-mode
http://www.usadellab.org/cms/?page=trimmomatic