sample1
1.统计reads数
samtools view m64082_210415_203420.subreads.1--1.bam| wc -l
16394742 #sample1 reads number
2.统计bases数
samtools view m64082_210415_203420.subreads.1--1.bam| cut -f10 |wc -c
51688834095
51688834095-16394742=51672439353 #sample1 总bases数
3.统计reads长度分布
samtools view m64082_210415_203420.subreads.1--1.bam | cut -f10 | awk '{print length($1);}' | sort -n -r > sample1.subreads.length
4.计算N50
samtools fasta m64082_210415_203420.subreads.1--1.bam >m64082_210415_203420.subreads.1--1.fasta
perl ~/soft/N50_N90.pl m64082_210415_203420.subreads.1--1.fasta >sample1.subreads.N50 &
N50: 3465
setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/subreads")
a<-read.table(file = "sample1.subreads.length")
a$V1<-as.integer(a$V1)
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_histogram(binwidth=200,fill="#6e6dcd", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Read Length",limits=c(0, 20000))+
scale_y_continuous(name="Reads",limits=c(0, 1500000))
ggsave("sample1.subreads.length.tiff",height = 4,width = 8)
nohup /public/jychu/soft/smrtlink/smrtcmds/bin/ccs --min-passes 2 --min-length 50 --max-length 15000 --min-rq 0.9 --num-threads 20 m64082_210415_203420.subreads.1--1.bam sample0.ccs.bam &
1.统计reads数
samtools view sample1.ccs.bam | wc -l
628188 #reads number
2.统计bases数
samtools view sample1.ccs.bam | cut -f10 |wc -c
2205667668
2205667668-628188=2205039480 #all bases
3.统计序列长度、FP、质量值
samtools view sample1.ccs.bam | cut -f10 | awk '{print length($1);}' | sort -n -r > sample1.ccs.length
samtools view sample1.ccs.bam |cut -f16 | tr ":" "\t" | cut -f3 > sample1.Fullpasses
samtools view sample1.ccs.bam |cut -f17 | tr ":" "\t" | cut -f3 > sample1.Quality
cat sample1.Quality| awk '{sum+=$1} END {print "Avg =", sum/NR}'
Avg = 0.995584
cat sample1.Fullpasses| awk '{sum+=$1} END {print "Avg =", sum/NR}'
Avg = 22.9421
setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/ccs")
a<-read.table(file = "sample1.ccs.length")
a$V1<-as.integer(a$V1)
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_histogram(binwidth=200,fill="#6498C8", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Read Length",limits=c(0, 15000))+
scale_y_continuous(name="Reads",limits=c(0, 50000))
ggsave("sample1.ccs.length.tiff",height = 4,width = 8)
rm(list=ls())
setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/ccs")
a<-read.table(file = "sample1.Quality")
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_histogram(binwidth=0.0008,fill="#87C966", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Quality",limits=c(0.9,1))+
scale_y_continuous(name="Reads",limits=c(0,30000))
ggsave("sample1.ccs.quality.tiff",height = 4,width = 8)
rm(list=ls())
setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/ccs")
a<-read.table(file = "sample1.Fullpasses")
a$V1<-as.integer(a$V1)
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_histogram(binwidth=2,fill="#F4AB58", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Number of Passes",limits=c(0, 200))+
scale_y_continuous(name="Reads",limits=c(0,60000))
ggsave("sample1.ccs.Fullpasses.tiff",height = 4,width = 8)
1.统计含有5p引物序列数
vi 5p.fasta
>primer_5p
GCAATGAAGTCGCAGGGTTGGG
/public/jychu/soft/smrtlink/smrtcmds/bin/lima --peek-guess -j 20 sample1.ccs.bam 5p.fasta sample1.5p.bam
samtools view sample1.5p.bam| wc -l
563560 #Number of five-five prime reads
rm -f sample1.5p.*
2.统计含有3p引物序列数
vi 3p.fasta
>primer_3p
AAGCAGTGGTATCAACGCAGAGTAC
/public/jychu/soft/smrtlink/smrtcmds/bin/lima --peek-guess -j 20 sample1.ccs.bam 3p.fasta sample1.3p.bam
samtools view sample1.3p.bam| wc -l
615510 #Number of three-three prime reads
rm -f sample1.3p.*
3.统计同时含有3p和5p的序列数
vi IsoSeqPrimers.fasta
>primer_5p
GCAATGAAGTCGCAGGGTTGGG
>primer_3p
AAGCAGTGGTATCAACGCAGAGTAC
/public/jychu/soft/smrtlink/smrtcmds/bin/lima --isoseq --dump-clips --peek-guess -j 20 sample1.ccs.bam IsoSeqPrimers.fasta sample1.fl.bam &
samtools view sample1.fl.primer_5p--primer_3p.bam | wc -l
540612 #Number of five-three prime reads
mv sample1.fl.* fl/
- isoseq3 refine(去除ployA和嵌合序列)
1.去除ployA和嵌合序列
/public/jychu/soft/smrtlink/smrtcmds/bin/isoseq3 refine --require-polya --min-polya-length 20 -j 4 sample1.fl.primer_5p--primer_3p.bam IsoSeqPrimers.fasta sample1.flnc.bam
mkdir flnc
mv sample1.flnc.* flnc/
2.统计全长非嵌合序列数
samtools view sample1.flnc.bam |wc -l
539221 #Number of full-length non-chimeric reads
3.统计序列长度
samtools view sample1.flnc.bam | cut -f10 | awk '{print length($1);}' | sort -n -r > sample1.flnc.length
cat sample1.flnc.length| awk '{sum+=$1} END {print "Avg =", sum/NR}'
Avg = 3413.56 #Mean length of flnc
4.统计reads长度密度分布
把sample1.flnc.length下载到文件夹C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/flnc
setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/flnc")
a<-read.table(file = "sample1.flnc.length")
a$V1<-as.integer(a$V1)
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_density(fill="#fdd5d3", color="#e06666", alpha=0.9,size=0.4,linetype="solid")+ #size指外部线条大小
theme_bw()+ scale_x_continuous(name="Sequence length(bp)",breaks =seq(0,12000,2000),limits=c(0, 12000))+
theme(plot.title = element_text(size = 15,face = "bold", vjust = 0.5, hjust = 0.5),panel.background = element_rect(fill = "transparent",colour = NA), panel.grid.minor = element_blank(), panel.grid.major = element_blank(),plot.background = element_rect(fill = "transparent",colour = NA))
ggsave("sample1.flnc.length.tiff",height = 4,width = 6)
1.序列聚类
mkdir clustered
/public/jychu/soft/smrtlink/smrtcmds/bin/isoseq3 cluster sample1.flnc.bamclustered/sample1.clustered.bam -j 20 --verbose --use-qvs
cd clustered
2.统计一致性转录本数目
samtools view sample1.clustered.bam | wc -l
46314 #Number of consensus isoforms(一致转录本)
samtools view sample1.clustered.hq.bam | wc -l
45663 #Number of high-quality isoforms
samtools view sample1.clustered.lq.bam | wc -l
651 #Number of polished low-quality isoforms
3.统计一致性转录本序列长度
samtools view sample1.clustered.bam | cut -f10 | awk '{print length($1);}' | sort -n -r > sample1.clustered.length
4.统计一致性转录本reads长度密度分布
下载sample1.clustered.length到目录C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/clustered
统计一致性转录本reads长度密度分布
setwd("C:/Users/TE/Desktop/全长转录组/绍梅师姐/sample1/clustered")
a<-read.table(file = "sample1.clustered.length")
a$V1<-as.integer(a$V1)
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_density(fill="#fdd5d3", color="#e06666", alpha=0.9,size=0.4,linetype="solid")+ #size指外部线条大小
theme_bw()+ scale_x_continuous(name="Sequence length(bp)",breaks =seq(0,12000,2000),limits=c(0, 12000))+
theme(plot.title = element_text(size = 25,face = "bold", vjust = 0.5, hjust = 0.5),panel.background = element_rect(fill = "transparent",colour = NA), panel.grid.minor = element_blank(), panel.grid.major = element_blank(),plot.background = element_rect(fill = "transparent",colour = NA))
ggsave("sample1.clustered.length.tiff",height = 4,width = 6)
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered
mkdir uncompare
cp sample1.clustered.hq.fasta.gz uncompare/
cd uncompare
gzip -d sample1.clustered.hq.fasta.gz
/public/jychu/soft/cd-hit-v4.8.1-2019-0228/cd-hit -i sample1.clustered.hq.fasta -o sample1.fasta -c 0.99 -T 30 -d 40 -M 16000 -U 10 -p 1 -G 1 -s 0.99
45013 finished 45013 clusters非冗余的高质量转录本
#预测转录本中长的开放阅读框, 默认是100个氨基酸,可以用-m修改
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir CDS
cp sample1.fasta CDS/
cd CDS
~/soft/TransDecoder-v5.5.0/TransDecoder.LongOrfs -t sample1.fasta
#使用DIAMOND对上一步输出的longest_orfs.pep在蛋白数据库中进行搜索,寻找同源证据支持
1. 使用uniprot蛋白数据库-BLASTP比对
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/CDS/sample1.fasta.transdecoder_dir
~/soft/diamond/diamond blastp -d ~/database/uniprot_sprot/uniprot_sprot -q longest_orfs.pep --evalue 1e-5 --sensitive -p 40 --max-target-seqs 1 > blastp.outfmt6
1.1 预测可能的编码区
~/soft/TransDecoder-v5.5.0/TransDecoder.Predict -t sample1.fasta --retain_blastp_hits sample0.fasta.transdecoder_dir/blastp.outfmt6
cat sample1.fasta.transdecoder.bed | cut -f1 | sort | uniq |wc -l
37632 #有CDS结构的序列数
cat sample0.fasta.transdecoder.bed | wc -l
49907 #检测出CDS区
修改文件名
mv sample1.fasta.transdecoder.bed sample1.uniport.transdecoder.bed
mv sample1.fasta.transdecoder.cds sample1.uniport.transdecoder.cds
mv sample1.fasta.transdecoder.gff3 sample1.uniport.transdecoder.gff3
mv sample1.fasta.transdecoder.pep sample1.uniport.transdecoder.pep
把50个碱基一行的序列改为该序列的所有碱基为一行
seqkit seq sample1.uniport.transdecoder.cds -w 0 > sample1.uniport.cds.fa
下载 sample1.uniport.cds.fa sample1.uniport.transdecoder.gff3 到桌面目录
==============================================================
1.2 使用自己提取出来的nr动物蛋白数据库
1.2.1 blastp比对
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/CDS/sample1.fasta.transdecoder_dir
~/soft/diamond/diamond blastp -p 40 -d ~/database/nr/nr.animal -q longest_orfs.pep --evalue 1e-5 > nr.animal.blastp.outfmt6
sort -k1,1 -k12,12nr nr.animal.blastp.outfmt6 |awk '!a[$1]++' >nr.animal.blastp.outfmt6.txt
把 nr ID转换成genesymbol 网站(https://biodbnet-abcc.ncifcrf.gov/db/db2db.php)
cat nr.animal.blastp.outfmt6.txt |cut -f2 |sort|uniq >refid.txt
下载到桌面在excel中打开,复制第一列到ID转换网站(用GeneBank Protein Accession转换)
保存为genebank.txt 上传至服务器
在nr-animal.fa中查找refid.txt对应的序列
vi refid.sh
cat refid.txt | while read id;
do
arr=(${id})
geneid=${arr[0]}
grep -w "$geneid" ~/database/nr/nr.animal.fa >> sample1.nr.protein
done
chmod a+x refid.sh
nohup ./refid.sh & #8个小时
cat sample1.nr.protein| sort |uniq |tr ">" "\t"|cut -f2 >sample1_nr.protein #提取第一个注释到的物种信息
cat sample1_nr.protein|tr " " "\t"|awk '{$1=null;print $0}'|tr "\t" " "|sed 's/^[\t ]*//g'>nr.detil #把id去掉,并去掉首行的空格
sed -i "1d" genebank.txt #去掉首行
cat genebank.txt|cut -f1,2 >col2 #取前两列
paste col2 nr.detil | tr "\t" ":" >sample1.nr.zhushi #把文本转换成AAB96843:VCL:vinculin [Mus musculus] 的格式
rm -f col2 nr.detil nr.animal.blastp.outfmt6 #去掉没用的文件
cat nr.animal.blastp.outfmt6.txt | cut -f1,2 |tr "\t" " "|tr "." "\t"|cut -f1,2|tr "\t" "."|tr " " "\t"| sort -k2,2 >nr.animal.blastp.outfmt6.txt.2col #提取nr比对结果文件的前两列,并把.去掉,然后按照第二列的字符排序
cut -f2 nr.animal.blastp.outfmt6.txt.2col >col2 #提取按照字符排序的序列ID
从sample1.nr.zhushi中遍历col2文件的每一行字符
vi serach.sh
cat col2| while read id;
do
arr=(${id})
geneid=${arr[0]}
grep -w "$geneid" sample1.nr.zhushi >> nr.animal.blastp.outfmt6.txt.2col.col2
done
chmod a+x serach.sh
./serach.sh
cat nr.animal.blastp.outfmt6.txt.2col|cut -f1 >nr.animal.blastp.outfmt6.txt.2col.col1 #取第一列转录本ID
paste nr.animal.blastp.outfmt6.txt.2col.col1 nr.animal.blastp.outfmt6.txt.2col.col2 > sample1.nr.final.zs #合并转录本ID和其对应的nr数据库的注释信息
1.2.3 预测可能的编码区
cd ..
~/soft/TransDecoder-v5.5.0/TransDecoder.Predict -t sample1.fasta --retain_blastp_hits sample1.fasta.transdecoder_dir/sample1.nr.final.zs
cat sample1.fasta.transdecoder.bed | cut -f1 | sort | uniq |wc -l
37684 #有CDS结构的序列数
cat sample1.fasta.transdecoder.bed | wc -l
50206 #检测出50206个CDS区
修改文件名
mv sample1.fasta.transdecoder.bed sample1.nr.transdecoder.bed
mv sample1.fasta.transdecoder.cds sample1.nr.transdecoder.cds
mv sample1.fasta.transdecoder.gff3 sample1.nr.transdecoder.gff3
mv sample1.fasta.transdecoder.pep sample1.nr.transdecoder.pep
把50个碱基一行的序列改为该序列的所有碱基为一行
seqkit seq sample1.nr.transdecoder.cds -w 0 > sample1.nr.cds.fa
下载 sample1.nr.cds.fa sample1.nr.transdecoder.gff3 到桌面目录
运行前将misa.ini与fasta文件放在一起,输入的序列存在fasta文件里面,然后运行下面的命令
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir SSR
cp sample1.fasta SSR/
cd SSR
cp ~/soft/misa/misa.ini ./
perl ~/soft/misa/misa.pl sample1.fasta
less -S sample1.fasta.statistics
Total number of identified SSRs: 45532
Number of SSRs present in compound formation: 4520 #复合卫星数目
wc -l sample1.fasta.misa
41013 sample1.fasta.misa #该文件包含所有的SSR序列的信息,但是会把符合复合微卫星序列的序列信息放在一行,所以文件的行数比总微卫星数目少
41013=45532-4520+1 #+1是因为第一行是行首标题
下载sample1.fasta.misa sample1.fasta.statistics到桌面
计算SSR的密度分布(count/MB)
首先统计sample1的序列总长度
grep -v ">" sample1.fasta|wc -c
153248411
grep -v ">" sample1.fasta|wc -l
45013
总长度为153248411-45013=153203398
总Mb为153203398/1000000=153.2
---------------------------------------------------------------------------------------------------------
画密度分布图,新建文件ssrdensity.txt
type denity
C 29.50391645
P1 244.3146214
P2 21.15535248
P3 27.46083551
P4 2.950391645
P5 1.11618799
P6 0.208877285
setwd("C:/Users/TE/Desktop/绍梅师姐/sample1/uncompare/SSR")
data<-read.table(file="ssrdensity.txt",header=T,sep="\t")
library(ggplot2)
data$type=factor(data$type,levels=data$type)
CPCOLS <- c("#002fa7","#00fa9a","#008080","#ba55d3","#f0e68c","#f08080","#90ee90")
p<-ggplot(data=data,aes(x=denity,y=type,fill=type))+geom_bar(stat="identity",width=0.8)+coord_flip()+ scale_fill_manual(values = CPCOLS) + theme_bw() + theme(axis.text=element_text(face = "bold", color="gray50")) +labs(y="Period number",x="SSR count/bases(Mb)",title="SSR Density",size=20)+theme(panel.grid.major=element_blank(),panel.grid.minor= element_blank(),legend.position = "none")+theme(axis.text.x = element_text(size=13,face="bold", hjust = 0.5, vjust = 0.5))+theme(plot.title = element_text(hjust = 0.5,size=18,face="bold"))+ theme(axis.title.x = element_text(size = 15, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+theme(axis.title.y = element_text(size = 15, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+theme(axis.text.y = element_text(size=13,face="bold", hjust = 0.5, vjust = 0.5))
ggsave("SSRDensity.tiff", p, width = 14, height=10)
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir zhushi
cd zhushi
mkdir nr
mkdir uniport
mkdir kog
#用diamond BLASTX进行注释
1. nr注释
cd nr
nohup ~/soft/diamond/diamond blastx --db ~/database/nr/nr.animal -q ../../sample1.fasta --sensitive -p 40 --evalue 1e-5 -o sample1_nr_matches_fmt6.txt &
cat sample1_nr_matches_fmt6.txt| cut -f1 | sort |uniq|wc -l
37149 #比对上的转录本数目
sort -k1,1 -k12,12nr sample1_nr_matches_fmt6.txt |awk '!a[$1]++' >sample1.nr.fmt6 #提取每个转录本第12列的最大值的行
rm -f sample1_nr_matches_fmt6.txt #删除无用的文件
把 nr ID转换成genesymbol 网站(https://biodbnet-abcc.ncifcrf.gov/db/db2db.php)
cat sample1.nr.fmt6 |cut -f2 |sort|uniq >refid.txt
下载到桌面在excel中打开,复制第一列到ID转换网站(分别用GeneBank Protein Accession\RefSeq Protein Accession转换)
保存为genebank.txt 上传至服务器
在nr-animal.fa中查找refid.txt对应的序列
vi refid.sh
cat refid.txt | while read id;
do
arr=(${id})
geneid=${arr[0]}
grep -w "$geneid" ~/database/nr/nr.animal.fa >> sample1.nr.protein
done
chmod a+x refid.sh
nohup ./refid.sh & #6小时
cat sample1.nr.protein| sort |uniq |tr ">" "\t"|cut -f2 >sample1_nr.protein #提取第一个注释到的物种信息
cat sample1_nr.protein|tr " " "\t"|awk '{$1=null;print $0}'|tr "\t" " "|sed 's/^[\t ]*//g'>nr.detil #把id去掉,并去掉首行的空格
sed -i "1d" genebank.txt #去掉首行
cat genebank.txt|cut -f1,2 >col2 #取前两列
paste col2 nr.detil | tr "\t" ":" >sample1.nr.zhushi #把文本转换成AAB96843:VCL:vinculin [Mus musculus] 的格式
rm -f col2 nr.detil #去掉没用的文件
cat sample1.nr.fmt6 | cut -f1,2 |tr "\t" " "|tr "." "\t"|cut -f1,2|tr "\t" "."|tr " " "\t"| sort -k2,2 >nr.animal.blastp.outfmt6.txt.2col #提取nr比对结果文件的前两列,并把.去掉,然后按照第二列的字符排序
cut -f2 nr.animal.blastp.outfmt6.txt.2col |tr "." "\t" |cut -f1>col2 #提取按照字符排序的序列ID
从sample1.nr.zhushi中遍历col2文件的每一行字符
vi serach.sh
cat col2| while read id;
do
arr=(${id})
geneid=${arr[0]}
grep -w "$geneid" sample1.nr.zhushi >> nr.animal.blastp.outfmt6.txt.2col.col2
done
chmod a+x serach.sh
./serach.sh
cat nr.animal.blastp.outfmt6.txt.2col|cut -f1 >nr.animal.blastp.outfmt6.txt.2col.col1 #取第一列转录本ID
paste nr.animal.blastp.outfmt6.txt.2col.col1 nr.animal.blastp.outfmt6.txt.2col.col2 > sample1.nr.final.zs #合并转录本ID和其对应的nr数据库的注释信息,并下载到桌面
cat sample1.nr.final.zs|tr "[" "\t"|sed "s/]//g" |cut -f3|sort|uniq -c #查看比对到的物种的分类,复制到excel表里按数值降序
2.Swiss-Prot
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhuzhi/uniport
~/soft/diamond/diamond blastx -d ~/database/uniprot_sprot/uniprot_sprot -q ../../sample1.fasta --sensitive -p 40 --evalue 1e-5 -o sample1_uniprot_matches_fmt6.txt
sort -k1,1 -k12,12nr sample1_uniprot_matches_fmt6.txt |awk '!a[$1]++' >sample1.uniport.fmt6
rm -f sample1_uniprot_matches_fmt6.txt
wc -l sample1.uniport.fmt6
35577 #比对上的转录本数目
3.KEGG 在线比对参考https://www.jianshu.com/p/48716fa7321b
grep "K" query.ko.txt|cut -f1 >kegg.zhushi 18953个注释到的转录本
4. KOG数据库
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhuzhi/kog
~/soft/diamond/diamond blastx -d ~/database/KOG/kyva -q ../../sample1.fasta --sensitive --evalue 1e-5 -o sample1_kog_matches_fmt6.txt
35203 queries aligned. #比对上的转录本数目
sort -k1,1 -k12,12nr sample1_kog_matches_fmt6.txt |awk '!a[$1]++' >sample1_kog.fmt6 #下载到桌面
rm -f sample1_kog_matches_fmt6.txt
cat sample1_kog.fmt6|cut -f1,2 >kog.id
从kog.txt中遍历kog.id文件的第二列的每一行字符
vi serachkog.sh
cat kog.id| while read id;
do
arr=(${id})
geneid=${arr[1]}
grep -w "$geneid" /public/jychu/database/KOG/kog.txt >> kog.id.group
done
chmod a+x serachkog.sh
./serachkog.sh
wc -l kog.id.group
28977 #一共有28977个kogID可以注释到功能层级
cat kog.id.group|tr "]" "\t"|cut -f1>sample1.kog.group #提取功能描述的编号
cat sample1.kog.group|tr "\n" ","|sed "s/,//g" >row1 #把所有字符放在一行
wc -c row1 32764-1=32763个字符
把每一个字符分成一行
for((i=1;i<=32763;i++));
do
cut -b $i row1 >>sample1.kog.group.1row;
done
cat sample1.kog.group.1row|sort|uniq -c #统计每个字符(Group 分类)出现的次数
1577 A
711 B
577 C
1079 D
559 E
326 F
706 G
162 H
863 I
1111 J
2079 K
694 L
175 M
114 N
2685 O
493 P
163 Q
4991 R
2172 S
5311 T
2431 U
281 V
1005 W
27 X
424 Y
2047 Z
--------------------------------------------------
绘图
setwd("C:/Users/TE/Desktop/绍梅师姐/sample1/uncompare/数据库注释/kog")
library(ggplot2)
data<- read.table("kog.picture.txt",header=T, sep="\t")
CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5","#db7093")
data$Term=factor(data$Term,levels=data$Term)
p <- ggplot(data=data, aes(y=Term, x=number, fill=group)) +
geom_bar(stat="identity", width=0.8) + coord_flip() +
scale_fill_manual(values = CPCOLS) + theme_bw() +
theme(axis.text=element_text(face = "bold", color="gray50"))+labs(title = "KOG Function Classification of Consensus Sequence",y = "Function Class",x = "Frequency")+theme(panel.grid.major =element_blank(), panel.grid.minor = element_blank())+theme(legend.position = "right")+theme(plot.title = element_text(hjust = 0.5,size=17,face="bold"))+theme(axis.title.x = element_text(size = 14, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+theme(axis.title.y = element_text(size = 14, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+
guides(fill = guide_legend(title = NULL))+theme(axis.text.x = element_text(angle = 288,size=12,face="bold", hjust = 0.02, vjust = 0.02))
ggsave("KOGpicture.tiff", p, width = 12, height=13)
========================================================
进入桌面:
cd nr
cut -f1 sample1.nr.final.zs >nr.zhushi
cut -f1 ../kog/sample1_kog.fmt6 >../kog/kog.zhushi
cut -f1 ../uniport/sample1.uniport.fmt6 > ../uniport/uniport.zhushi
cat nr.zhushi ../uniport/uniport.zhushi ../kog/kog.zhushi ../kegg/kegg.zhushi|sort|uniq|wc -l
37215 #至少有一个数据库注释成功的转录本数目
1.CAPT预测
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir lncRNA
cd lncRNA
cpat.py -x /public/jychu/soft/CPAT/goose/goose_Hexamer.tsv -d /public/jychu/soft/CPAT/goose/goose.logit.RData -g ../sample1.fasta -o sample1.goose.CAPT.out
2. 统计nr数据库和uniport数据库没有注释到的转录本id
grep ">" ../sample1.fasta|sed "s/>//g"|tr " " "\t"|cut -f1 >transcript.all #提取所有的转录本id名称
cut -f1 ../zhushi/uniport/sample1.uniport.fmt6 >uniport.trans #提取uniport注释到的转录本id
cat transcript.all uniport.trans|sort|uniq -u > uniport.untrans #提取uniport未比对到的转录本id 9436个
rm -f uniport.trans #删除无用文件
cut -f1 ../zhushi/nr/sample1.nr.fmt6 >nr.trans #提取nr注释到的转录本id
cat nr.trans transcript.all |sort|uniq -u > nr.untrans #提取nr未比对到的转录本id 7864个
rm -f nr.trans
cd lncRNA
awk '$2>=200' sample1.goose.CAPT.out.ORF_prob.tsv |cut -f1|tr "_" "\t"|cut -f1|sort|uniq > capt.alltrans #提取ORF长度大于200的所有转录本id
awk '$10>=0.1 && $2>=200' sample1.goose.CAPT.out.ORF_prob.tsv|cut -f1|tr "_" "\t"|cut -f1|sort|uniq > goose.unlncRNA #提取编码潜力大于0.1的转录本id
cat goose.unlncRNA capt.alltrans|sort|uniq -u > capt.untrans #提取编码潜力小于0.1,ORF长度大于200的转录本id 11281个
rm -f capt.alltrans goose.unlncRNA
下载 uniport.untrans nr.untrans capt.untrans 到桌面
3.用CPC2软件预测编码潜能
conda activate python2 #cpc2脚本只能在python2环境下使用
python /public/jychu/soft/CPC2-beta/bin/CPC2.py -i ../sample1.fasta -o sample1.cpc2 #运行命令
sample1.cpc2.txt #输出文件
grep "noncoding" sample1.cpc2.txt |cut -f1 >sample1.cpc2.lnc #把预测为没有编码能力的转录本ID提取出来,下载到桌面
wc -l sample1.cpc2.lnc
12831 sample1.cpc2.lnc #一共预测到12831个没有编码能力的转录本
--------------------------------------------------------------------------------------------
在https://bioinfogp.cnb.csic.es/tools/venny/index.html 网站绘制韦恩图
把交集的序列保存为sample1.lnc.all,在服务器中新建sample1.lnc.all文件,并把序列粘贴上去(如果直接把文件上传上去,shell脚本识别不了该文件),下载到桌面上
conda deactivate
---------------------------------------------------------------------------------------------
4.构建鹅的lncRNA数据库索引,用blast进行比对
cd /public/jychu/reference/index/hisat/goose
makeblastdb -in lncRNA.name.fa -parse_seqids -dbtype nucl -out lncrna.db #建立索引
提取出所有的lncRNA序列
vi lnc.all.sh
cat sample1.lnc.all| while read id;
do
arr=(${id})
geneid=${arr[0]}
grep -w -A1 "$geneid" ../sample1.fasta >> sample1.lnc.all.fa #下载到桌面上
done
chmod a+x lnc.all.sh
./lnc.all.sh &
blastn -query sample1.lnc.all.fa -out sample1.goose.blast -db /public/jychu/reference/index/hisat/goose/lncrna.db -outfmt 6 -evalue 1e-5 -num_threads 40 #lncran序列与鹅的已知lncrna数据库进行比对
sort -k1,1 -k12,12nr sample1.goose.blast |awk '!a[$1]++' >sample1.goose.lnc #提取每个转录本第12列的最大值的行,下载到桌面上
rm -f sample1.goose.blast #删除无用的文件
cut -f1 sample1.goose.lnc|sort|uniq|wc -l
230 #一共有230个转录本比对到鹅的lncRNA上
cut -f2 sample1.goose.lnc|sort|uniq|wc -l
124 #一共比对上124个lncRNA
seqkit seq sample1.goose.CAPT.out.ORF_seqs.fa -w 0 > sample1.goose.CAPT.ORF.fa #把序列转换成一行
sed -i "s/_/ /g" sample1.goose.CAPT.ORF.fa #把行首标签替换字符,以便下面脚本中序列名称的匹配
vi lnc.all.orf.sh #提取lncRNA预测的ORF序列
cat sample1.lnc.all| while read id;
do
arr=(${id})
geneid=${arr[0]}
grep -i -w -A1 "$geneid" sample1.goose.CAPT.ORF.fa >> sample1.lncrna.ORF.fa #下载到桌面上
done
chmod a+x lnc.all.orf.sh
./lnc.all.orf.sh
grep ">" sample1.lnc.all.fa|tr "=" "\t"|cut -f3 >sample1.lnc.all.length #获取序列长度信息,并下载到桌面上
画序列长度分布图
setwd("C:/Users/TE/Desktop/绍梅师姐/sample1/uncompare/lncRNA")
a<-read.table(file = "sample1.lnc.all.length")
a$V1<-as.integer(a$V1)
library(ggplot2)
ggplot(data=a,aes(x=V1)) +
geom_histogram(binwidth=200,fill="#6498C8", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Read Length",limits=c(0, 15000))+
scale_y_continuous(name="count")
ggsave("sample1.lnc.all.length.tiff",height = 4,width = 8)