- 在此介绍一个可直接筛选的脚本
select_longest_transcripts.pl
1 #!/usr/bin/perl -w
2 use strict;
3 die "perl $0 <genomic.gff|genomic.gff.gz> <all.cds|all.cds.gz>\n" unless @ARGV == 2;
4
5 ##### example inputs: GCA_002891405.2_Csec_1.0_genomic.gff.gz, GCA_002891405.2_Csec_1.0_cds_from_genomic.fna.gz #####
6
7 my $in = shift;
8 my $pool = shift;
9
10 ##### get the longest transcript of each mRNA #####
11
12 if( $in =~ /.*\.gz$/ ){
13 open IN, "gzip -dc $in | " || die "$!\n";
14 }else{
15 open IN, $in || die "$!\n";
16 }
17
18 my ($ref, $gene, $trans);
19 while( <IN> ){
20 chomp;
21 next if /^$/;
22 next if /#/;
23 my @tmp = split(/\t/, $_);
24 if( $tmp[2] eq "gene" ){
25 $gene = $1 if $tmp[8] =~ /ID=(.*);Name.*/;
26 }
27 if( $tmp[2] eq "mRNA" ){
28 $trans = $1 if $tmp[8] =~ /ID=(.*);Parent.*/;
29 }
30 if( $tmp[2] eq "CDS" ){
31 my ($cds, $parent) = ($1, $2) if $tmp[8] =~ /ID=cds\.evm\.model\.(.*);Parent=(.*)/;
32 $ref->{$gene}{$cds} += $tmp[4] - $tmp[3] if $parent eq $trans;
33 }
34 }
35 close IN;
36
37 open O1, ">$in.all.mRNA.len.info.xls" || die "$!\n";
38 open O2, ">$in.all.longst_mRNA.info.xls" || die "$!\n";
39 for my $key ( sort keys %$ref ){
40 my $gene = $ref->{$key};
41 print O1 $key;
42 print O2 $key;
43 for my $mrna ( sort { $gene->{$a} <=> $gene->{$b} } keys %$gene ){
44 print O1 "\t$mrna\t$gene->{$mrna}\n";
45 }
46 my @sort = sort { $gene->{$a} <=> $gene->{$b} } keys %$gene;
47 print O2 "\t$sort[-1]\t$gene->{$sort[-1]}\n";
48 }
49 close O1;
50 close O2;
51
52 ##### get longest mRNAs from the all mRNAs pool #####
53
54 my (%Seq, $id);
55 if( $pool =~ /.*\.*gz$/ ){
56 open CDS, "gzip -dc $pool | " || die "$!\n";
57 }else{
58 open CDS, $pool || die "$!\n";
59 }
60
61 while( <CDS> ){
62 chomp;
63 #if( />.*_cds_(.*)_\d+\s+\[\S+.*/ ){
64 if( />evm\.model\.(.*)\s+.*/ ){
65 $id = $1;
66 }else{
67 $Seq{$id} .= $_;
68 }
69 }
70 close CDS;
71
72 open IN, "$in.all.longst_mRNA.info.xls" || die "$!\n";
73 open OUT, ">$in.longest_transcript.cds.fa" || die "$!\n";
74 while( <IN> ){
75 chomp;
76 my @tmp = split(/\s+/, $_);
77 print OUT ">$tmp[1]\n";
78 for(my $i = 0; $i <= length($Seq{$tmp[1]}); $i += 80){
79 print OUT substr($Seq{$tmp[1]}, $i, 80), "\n";
80 }
81 }
82 close IN;
83 close OUT;
重点改写第25, 28, 31, 64正则匹配.gff文件和.cds文件里的信息
- 筛选之前先检查所下载的gff文件中是否存在多转录本的情况
awk '$3=="gene"' *gff | wc -l
awk '$3=="mRNA"' *gff | wc -l
wc -l *xls
- 问题报错排查
Use of uninitialized value $tmp[2] in string eq at select_longest_transcripts_v2_Tachypleus_tridentatus.pl line 24, <IN> line 200027.
Use of uninitialized value $tmp[2] in string eq at select_longest_transcripts_v2_Tachypleus_tridentatus.pl line 27, <IN> line 200027.
Use of uninitialized value $tmp[2] in string eq at select_longest_transcripts_v2_Tachypleus_tridentatus.pl line 30, <IN> line 200027.
- 添加
print
参数进行排查
##### get the longest transcript of each mRNA #####
if( $in =~ /.*\.gz$/ ){
open IN, "gzip -dc $in | " || die "$!\n";
}else{
open IN, $in || die "$!\n";
}
my ($ref, $gene, $trans);
while( <IN> ){
chomp;
next if /^$/;
next if /#/;
my @tmp = split(/\t/, $_);
if( $tmp[2] eq "gene" ){
$gene = $1 if $tmp[8] =~ /ID=(.*);.*/;
print "gene\t$gene\n";
}
if( $tmp[2] eq "mRNA" ){
$trans = $1 if $tmp[8] =~ /ID=(.*);Parent.*/;
print "trans\t$trans\n";
}
if( $tmp[2] eq "CDS" ){
my ($cds, $parent) = ($1, $2) if $tmp[8] =~ /ID=cds\.evm\.model\.(.*);Parent=(.*)/;
print "cds\t$cds\n"; print "parent\t$parent\n";
$ref->{$gene}{$cds} += $tmp[4] - $tmp[3] if $parent eq $trans;
}
}
close IN;
open O1, ">$in.all.mRNA.len.info.xls" || die "$!\n";
open O2, ">$in.all.longst_mRNA.info.xls" || die "$!\n";
这个案例中的报错排查的结果是下载的gff文件中有空格行的原因,在脚本中添加了
next if /^$/;
后得到解决
可供选择的第二种方法,用OrthoFinder自带的脚本进行挑选(仅针对ensembl下载数据)
- 请参考OrthoFinder
The files from Ensembl will contain many transcripts per gene. If we ran OrthoFinder on these raw files it would take 10x longer than necessary and could lower the accuracy. We’ll use a script provided with OrthoFinder to extract just the longest transcript variant per gene and run OrthoFinder on these files:
for f in *fa ; do python ~/orthofinder_tutorial/OrthoFinder/tools/primary_transcript.py $f ; done
import os
import sys
from collections import Counter, defaultdict
# Use the 'all' version rather than ab initio
def ScanTags(fn):
"""
For ensembl genomes, look for tag:id and count repeated ids
:param fn:
:return:
"""
tags = set()
tokens = []
with open(fn, 'r') as infile:
for line in infile:
if not line.startswith(">"): continue
tokens.append([t.split(":", 1) for t in line.rstrip().split() if ":" in t])
tags.update([t[0] for t in tokens[-1]])
for this_tag in tags:
print(this_tag)
# print(tokens[-1])
c = Counter([idd for acc in tokens for t, idd in acc if t == this_tag])
print(c.most_common(5))
print("")
def ScanTags_NCBI(fn):
genes = []
with open(fn, 'r') as infile:
for line in infile:
if not line.startswith(">"): continue
genes.append(line[1:].split(".", 1)[0])
print("%d sequences, %d genes" % (len(genes), len(set(genes))))
def ScanTags_with_fn(fn, gene_name_fn):
genes = []
with open(fn, 'r') as infile:
for line in infile:
if not line.startswith(">"): continue
genes.append(gene_name_fn(line))
print("%d sequences, %d genes" % (len(genes), len(set(genes))))
# print(genes[0])
# print(sorted(genes)[:10])
def GetGeneName(acc_line):
tokens = [(t.split("=") if "=" in t else t.split(":"))[1] for t in acc_line.rstrip().split() if ("gene:" in t or "gene=" in t)]
if len(tokens) != 1: return None
return tokens[0]
def CreatePrimaryTranscriptsFile(fn, dout, gene_name_fn=GetGeneName):
# Get genes and lengths
max_gene_lens = defaultdict(int)
with open(fn, 'r') as infile:
lines = [l.rstrip() for l in infile]
N = len(lines) - 1
nAcc = 0
nGeneUnidentified = 0
acc_to_use = defaultdict(str)
iLine = -1
while iLine < N:
iLine += 1
line = lines[iLine]
if not line.startswith(">"): continue
nAcc += 1
iLineAcc = iLine
gene = gene_name_fn(line)
if gene == None:
nGeneUnidentified += 1
continue
# get length
l = 0
while iLine < N:
iLine += 1
line = lines[iLine]
if line.startswith(">"):
iLine -= 1
break
l += len(line.rstrip())
if l > max_gene_lens[gene]:
max_gene_lens[gene] = l
acc_to_use[gene] = iLineAcc
print("Found %d accessions, %d genes, %d unidentified transcripts" % (nAcc, len(max_gene_lens), nGeneUnidentified))
# print(gene)
# print(sorted(max_gene_lens.keys())[:10])
# print(len(set(max_gene_lens.keys())))
# Get longest version for each gene
# Parse file second time and only write out sequences that are longest variant
nGenesWriten = 0
outfn = dout + os.path.basename(fn)
with open(outfn, 'w') as outfile:
iLine = -1
while iLine < N:
iLine += 1
line = lines[iLine]
if not line.startswith(">"): continue
gene = gene_name_fn(line)
# transcripts not identifying the gene should be written
if gene != None and iLine != acc_to_use[gene]: continue
acc_line_out = line + "\n" if gene == None else ">%s\n" % gene
nGenesWriten += 1
outfile.write(acc_line_out)
while iLine < N:
iLine += 1
line = lines[iLine]
if line.startswith(">"):
iLine -= 1
break
outfile.write(line + "\n")
print("Wrote %d genes" % nGenesWriten)
if nGenesWriten != len(max_gene_lens) + nGeneUnidentified:
print("ERROR")
raise Exception
print(outfn)
def last_dot(text):
return text[1:].rstrip().rsplit(".", 1)[0]
def space(text):
return text[1:].rstrip().split(None, 1)[0]
function_dict = {"last_dot":last_dot, "space":space}
def main(args=None):
if args is None:
args = sys.argv[1:]
fn = args[0]
dout = os.path.dirname(os.path.abspath(fn)) + "/primary_transcripts/"
if not os.path.exists(dout):
os.mkdir(dout)
if len(sys.argv) == 3:
gene_name_function_name = function_dict[sys.argv[2]]
ScanTags_with_fn(fn, gene_name_function_name)
CreatePrimaryTranscriptsFile(fn, dout, gene_name_function_name)
else:
# ScanTags(fn)
# ScanTags_NCBI(fn)
# ScanTags_second_dot(fn)
CreatePrimaryTranscriptsFile(fn, dout)
if __name__ == "__main__":
args = sys.argv[1:]
main(args)
可供选择的第三种方法,用BITACORA自带的脚本Scripts/Tools/exclude_isoforms_fromfasta.pl 进行挑选
- 请参考BITACORA
#!/usr/bin/perl
use strict;
use warnings;
# Script to cluster genes (longest gene as representative) in overlapping positions (being putative isoforms or bad-annotations)
# usage: perl exclude_isoforms_fromfasta.pl file.gff3 file.fasta
die "Script to cluster genes (keeping the longest sequence as representative) in overlapping positions, i.e. clustering isoforms \n\nperl exclude_isoforms_fromfasta.pl file.gff3 file.fasta \n\n" unless @ARGV == 2;
my ($line, $name, $nameout);
my %overlap; my %mrna; my %fasta;
my $outname="";
if ($ARGV[1] =~ /(\S+)\.fa/){
$outname = $1;
} else {
$outname = $ARGV[1];
}
open (Fasta , "<", $ARGV[1]);
while (<Fasta>) {
chomp;
my $line = $_;
next if ($line !~ /\S+/);
if ($line =~ />(\S+)/){
$name = $1;
} else {
$fasta{$name} .= $line;
}
}
close Fasta;
open (Results, ">", "$ARGV[0]\_overlapping_genes.txt");
open (GFFfile , "<", $ARGV[0]);
while (<GFFfile>) {
chomp;
$line = $_;
next if ($line !~ /\S+/);
next if ($line =~ /^#/);
my @subline = split (/\t/, $line);
if ($subline[2] =~ /CDS/){
####### No needed in this script
next;
#######
my $genename = "";
if ($subline[8] =~ /Parent=([^;]+)/){
#if ($subline[8] =~ /transcript_id \"(\S+)\"/){
$genename = $1;
}
else {die "ERROR in get_overlapping_genes_fromgff.pl: It fails detecting Parent ID in $line\n";}
if ($genename =~ /(\S+)\_\d+dom/){
$genename = $1;
}
}
elsif ($subline[2] =~ /mRNA/){
next if ($line =~ /^END/);
my $genename = "";
if ($subline[8] =~ /ID=([^;]+)/){
#if ($subline[8] =~ /transcript_id \"(\S+)\"/){
$genename = $1;
}
else {print "ERROR in get_overlapping_genes_fromgff.pl: It fails detecting ID in $line\n";}
# if ($genename =~ /(\S+)\_\d+dom/){
# $genename = $1;
# }
#Overlap checking
my $start = "";
my $end = "";
my $frame = "$subline[6]";
$frame =~ s/\+/\\\+/g;
$frame =~ s/\-/\\\-/g;
if ($subline[4] > $subline[3]){
$start = $subline[3];
$end = $subline[4];
} else {
$start = $subline[4];
$end = $subline[3];
}
foreach my $key (keys %{$mrna{$subline[0]}}){
my @subl2 = split (/\s/, $mrna{$subline[0]}{$key});
next if ($subl2[2] !~ /$frame/); # First check chain
my $start2 = $subl2[0];
my $end2 = $subl2[1];
my $over = 0;
if ($start <= $start2 && $end >= $end2){
$over++
} elsif ($start2 <= $start && $end2 >= $end){
$over++
} elsif ($start <= $start2 && $end >= $start2){
$over++
} elsif ($start2 <= $start && $end2 >= $start){
$over++
}
if ($over > 0){
#if (exists $overlap{$genename}){
# $overlap{$genename} .= "$key ";
#} elsif (exists $overlap{$key}){
# $overlap{$key} .= "$genename ";
#} else {
$overlap{$subline[0]}{$genename} .= "$key ";
$overlap{$subline[0]}{$key} .= "$genename ";
#}
}
}
$mrna{$subline[0]}{$genename} = "$start $end $frame";
}
elsif ($subline[2] =~ /gene/){
####### No needed in this script
next;
#######
next if ($line =~ /^END/);
my $genename = "";
if ($subline[8] =~ /ID=([^;]+)/){
#if ($subline[8] =~ /transcript_id \"(\S+)\"/){
$genename = $1;
}
else {print "ERROR in get_overlapping_genes_fromgff.pl: It fails detecting ID in $line\n";}
if ($genename =~ /(\S+)\_\d+dom/){
$genename = $1;
}
}
}
close GFFfile;
my %overlapgood;
foreach my $scaf (sort keys %overlap){
my $exclude;
foreach my $key (sort keys %{$overlap{$scaf}}){
#print "$key $overlap{$scaf}{$key}\n"; #### Debug
my @subl = split (/\s/, $overlap{$scaf}{$key});
if (exists $overlapgood{$scaf}{$key}){
foreach my $sgen (@subl){
my $list = $overlapgood{$scaf}{$key};
#$list =~ s/\./\\\./g;
$list =~ s/\|/\\\|/g;
if ($list =~ /$sgen /){
next;
} else {
$overlapgood{$scaf}{$key} .= "$sgen ";
}
}
} else {
my $cont = 0;
foreach my $sgen (@subl){
if (exists $overlapgood{$scaf}{$sgen}){
$cont++;
my $list = "$sgen $overlapgood{$scaf}{$sgen}";
#$list =~ s/\./\\\./g;
$list =~ s/\|/\\\|/g;
if ($list !~ /$key /){
$overlapgood{$scaf}{$sgen} .= "$key ";
}
foreach my $ssgen (@subl){
if ($list =~ /$ssgen /){
next;
} else {
$overlapgood{$scaf}{$sgen} .= "$ssgen ";
}
}
}
}
if ($cont == 0){
$overlapgood{$scaf}{$key} .= "$overlap{$scaf}{$key}";
} elsif ($cont > 1){
#print "Warning, two different fields have been saved for the same gene in $key $overlap{$scaf}{$key}\n";
}
}
}
}
my %overlapgood2;
foreach my $scaf (sort keys %overlapgood){
my $exclude;
foreach my $key (sort keys %{$overlapgood{$scaf}}){
#print "$key $overlapgood{$scaf}{$key}\n"; #### Debug
my @subl = split (/\s/, $overlapgood{$scaf}{$key});
if (exists $overlapgood2{$key}){
foreach my $sgen (@subl){
my $list = $overlapgood2{$key};
#$list =~ s/\./\\\./g;
$list =~ s/\|/\\\|/g;
if ($list =~ /$sgen /){
next;
} else {
$overlapgood2{$key} .= "$sgen ";
}
}
} else {
my $cont = 0;
foreach my $sgen (@subl){
if (exists $overlapgood2{$sgen}){
$cont++;
my $list = "$sgen $overlapgood2{$sgen}";
#$list =~ s/\./\\\./g;
$list =~ s/\|/\\\|/g;
if ($list !~ /$key /){
$overlapgood2{$sgen} .= "$key ";
}
foreach my $ssgen (@subl){
if ($list =~ /$ssgen /){
next;
} else {
$overlapgood2{$sgen} .= "$ssgen ";
}
}
}
}
if ($cont == 0){
$overlapgood2{$key} .= "$overlapgood{$scaf}{$key}";
} elsif ($cont > 1){
#print "Warning, two different fields have been saved for the same gene in $key $overlapgood{$scaf}{$key}\n";
}
}
}
}
my $excluded = "0";
open (Resultsfa, ">", "$outname\_noiso.fasta");
open (Resultsiso, ">", "$outname\_isoforms_excluded.txt");
open (Resultsisotab, ">", "$outname\_isoforms_table.txt");
print Resultsisotab "Representative sequence\tIsoforms\n";
foreach my $key (sort keys %overlapgood2){
print Results "$key $overlapgood2{$key}\n";
$overlapgood2{$key} .= "$key ";
my $bseq = "firstseq";
my $blen = "0";
my $excludedseqs = "";
my @split = split(/\s/, $overlapgood2{$key});
foreach my $seq (@split){
unless (exists ($fasta{$seq})) {
next;
die "Can't find $seq in fasta file\n";
}
my $length = length($fasta{$seq});
if ($length > $blen){
$blen = $length;
unless ($bseq =~ /firstseq/){
print Resultsiso "$bseq\n";
$excludedseqs .= "$bseq ";
delete $fasta{$bseq};
$excluded++;
}
$bseq = $seq;
} else {
print Resultsiso "$seq\n";
$excludedseqs .= "$seq ";
delete $fasta{$seq};
$excluded++;
}
}
next if ($excludedseqs !~ /\S+/);
print Resultsisotab "$bseq\t$excludedseqs\n";
}
foreach my $key (sort keys %fasta) {
print Resultsfa ">$key\n$fasta{$key}\n";
}
close Results;
close Resultsfa;
close Resultsiso;
print "\nDone, $excluded sequences have been excluded. Output files have been saved in $outname\_noiso.fasta\n\n";