class WaterMelon:
def __init__(self,gtf,fasta):
self.gtf = gtf
self.fasta = fasta
def chromoLen(self):
chrLen = {}
for rec in SeqIO.parse(self.fasta,'fasta'):
chrLen[rec.id] = len(rec.seq)
return chrLen
def gcContent(self,gc_window):
self.gc_window = gc_window
gc_content = {'chr_id':[],
'bin_start':[],
'gc':[]}
chr_len = self.chromoLen()
#print(chr_len)
for rec in SeqIO.parse(self.fasta,'fasta'):
for i in range(0,chr_len[rec.id],self.gc_window):
#print(rec.id)
gc_content['chr_id'].append(rec.id)
gc_content['bin_start'].append(i)
gc_content['gc'].append(round(GC(rec.seq[i:i+self.gc_window]),2))
return pd.DataFrame(gc_content)
def geneDensity(self,gene_window):
self.gene_window = gene_window
final_df = []
df = pd.read_table(self.gtf,header=None,comment="#",sep="\t",
usecols=[0,2,3,4],
names="Chromosome Feature Start End".split())
#df.columns = "Chromosome Source Feature Start End Score Strand Frame Attribute".split()
df = df[df.Feature=="gene"]
chrLen = self.chromoLen()
for chr_id in chrLen.keys():
print(chr_id)
df1 = df[df.Chromosome==chr_id]
gene_start = [int(a) for a in df1.Start]
gene_start.insert(0,0)
gene_start.append(round(chrLen[chr_id]/self.gene_window)*self.gene_window+self.gene_window)
#print(gene_start)
bin_start = [int(a.left) for a in pd.cut(gene_start,bins=round(chrLen[chr_id]/self.gene_window)+1).value_counts().index]
bin_start[0] = 0
gene_count = list(pd.cut(gene_start,bins=round(chrLen[chr_id]/self.gene_window)+1).value_counts().values)
#print(len(bin_start))
#print(len(gene_count))
#print("OK")
final_df.append(pd.DataFrame({'chr_id':chr_id,'bin_start':bin_start,'gene_count':gene_count}))
return pd.concat(final_df)
用到了pandas和biopython模块
import pandas as pd
from Bio import SeqIO
from Bio.SeqUtils import GC
mp=WaterMelon(fasta="../Molecular_Plant/G42.fasta",
gtf="../Molecular_Plant/G42.gene.annotation.gff3")
dfgenedensity = mp.geneDensity(gene_window=500000)
dfgc = mp.gcContent(gc_window=500000)