bash get.longest.gff.sh input.gff > longest.gff
cat get.longest.gff.sh
#!/bin/bash
# 用法检查
if [ $# -ne 1 ]; then
echo "Usage: $0 input.gff" >&2
exit 1
fi
gff=$1
# 提取每个mRNA的CDS长度
awk '$3=="CDS" {
match($9, /Parent=([^;]+)/, a);
transcript=a[1];
len=($5 - $4 + 1);
cdslen[transcript] += len
}
END {
for (t in cdslen) print t, cdslen[t]
}' "$gff" > transcript_cds_lengths.txt
# 建立转录本到基因的映射
awk '$3=="mRNA" {
match($9, /ID=([^;]+)/, a);
tid=a[1];
match($9, /Parent=([^;]+)/, b);
gid=b[1];
print gid, tid
}' "$gff" > gene_transcript_map.txt
# 找出每个基因最长的转录本
awk '
NR==FNR {cdslen[$1]=$2; next}
{
gene=$1; tid=$2;
if (cdslen[tid] > maxlen[gene]) {
maxlen[gene] = cdslen[tid];
longest[gene] = tid
}
}
END {
for (g in longest) print longest[g]
}' transcript_cds_lengths.txt gene_transcript_map.txt > longest_transcripts.txt
# 提取对应转录本及其相关信息
awk -v ids="longest_transcripts.txt" '
BEGIN {
while ((getline < ids) > 0) {
keep[$1] = 1
}
}
{
if ($3=="gene") {
match($9, /ID=([^;]+)/, a);
current_gene = a[1];
gene_line = $0;
gene_needed = 0;
} else if ($3=="mRNA") {
match($9, /ID=([^;]+)/, a);
current_tx = a[1];
if (keep[current_tx]) {
print gene_line;
print $0;
gene_needed = 1;
} else {
gene_needed = 0;
}
} else if ($3=="exon" || $3=="CDS") {
match($9, /Parent=([^;]+)/, a);
if (keep[a[1]]) print $0;
}
}' "$gff"
有的时候还会遇到gtf格式,
bash get.longest.gtf.sh input.gff > longest.gtf
cat get.longest.gtf.sh
#!/bin/bash
# 用法说明
if [ $# -ne 1 ]; then
echo "Usage: $0 input.gtf" >&2
exit 1
fi
gtf=$1
# Step 1: 统计每个 transcript 的 exon 总长度
awk '$3=="exon" {
match($0, /gene_id "([^"]+)"/, a);
gene=a[1];
match($0, /transcript_id "([^"]+)"/, b);
transcript=b[1];
len=($5 - $4 + 1);
exon_len[transcript] += len;
transcript2gene[transcript] = gene;
}
END {
for (t in exon_len) {
print transcript2gene[t], t, exon_len[t]
}
}' "$gtf" > gtf.transcript_lengths.txt
# Step 2: 按 gene_id 筛出最长的 transcript_id
awk '
{
gene=$1; transcript=$2; len=$3
if (len > maxlen[gene]) {
maxlen[gene] = len
best[gene] = transcript
}
}
END {
for (g in best) print best[g]
}' gtf.transcript_lengths.txt > gtf.longest_transcripts.txt
# Step 3: 提取 GTF 文件中对应的注释行
awk '
BEGIN {
while ((getline < "gtf.longest_transcripts.txt") > 0) {
keep[$1] = 1
}
}
{
match($0, /transcript_id "([^"]+)"/, b);
tx = b[1];
if ($3 == "gene") {
gene_line = $0
match($0, /gene_id "([^"]+)"/, a);
current_gene = a[1];
printed_gene[current_gene] = 0;
}
else if ($3 == "transcript" && keep[tx]) {
match($0, /gene_id "([^"]+)"/, a);
gene_id = a[1];
if (!printed_gene[gene_id]) {
print gene_line;
printed_gene[gene_id] = 1;
}
print;
current_tx = tx;
}
else if (($3 == "exon" || $3 == "CDS") && keep[tx]) {
print;
}
}' "$gtf"