file = "Orthogroups/Orthogroups.tsv"

# Maps orthogroup ID -> {species name -> list of gene IDs}.
ogs = {}
with open(file) as f:
    # The first row is the header: column 0 labels the orthogroup column and
    # the remaining columns are the species names. The "" default keeps an
    # empty file from raising StopIteration.
    species = [name.strip() for name in next(f, "").split("\t")][1:]
    for row in f:
        # Split on tabs *before* stripping so that empty cells keep their
        # column position and stay aligned with `species`.
        fields = [field.strip() for field in row.split("\t")]
        if not fields[0]:
            # Blank line (or a row without an orthogroup ID): skipping it
            # avoids storing a bogus entry under the key ''.
            continue
        # Genes within one cell are separated by ", ". An empty cell yields
        # [''] (not []), so consumers must ignore empty strings.
        genes_per_species = [cell.split(", ") for cell in fields[1:]]
        ogs[fields[0]] = dict(zip(species, genes_per_species))
>>> len(ogs)
20667
>>> a = ogs["OG0020664"]
>>> a
{'Averrhoa_carambola': [''], 'Carica_papaya': [''], 'Coffea_canephora': [''], 'Prunus_avium': [''], 'Prunus_persica': [''], 'Ricinus_communis': [''], 'Theobroma_cacao': [''], 'Vitis_vinifera': ['GSVIVG01006304001', 'GSVIVG01006461001']}
>>> a['Vitis_vinifera']
['GSVIVG01006304001', 'GSVIVG01006461001']
持续更新......
针对OrthoFinder的结果Results_*/Orthogroups/Orthogroups.tsv
进行处理。
Python学得太差了,给自己定几个题目,进行学习:
- 任意提取某个OG下某个物种的所有基因
- 统计各个物种特有OG下的基因
- 统计单拷贝OG
- 统计各个OG单拷贝率
...
随着对代码的逐步掌握,再进行优化。
# IDs of single-copy orthogroups, i.e. orthogroups in which every species
# contributes exactly one gene.
singleogs = []
a = []  # not used in this block; kept in case later code relies on it (TODO confirm)
for og, spgenes in ogs.items():  # one iteration per orthogroup
    # An orthogroup with no species columns must not count as single-copy;
    # without this guard the all() below would be vacuously true.
    if not spgenes:
        continue
    # Empty cells are represented as [''], so drop empty strings before
    # counting the genes of each species.
    if all(
        len([gene for gene in genes if gene != '']) == 1
        for genes in spgenes.values()
    ):
        singleogs.append(og)
>>> len(singleogs)
4298
>>> ogs[singleogs[1]]
{'Averrhoa_carambola': ['geneYangtao2006611'], 'Carica_papaya': ['110807233'], 'Coffea_canephora': ['Cc02_g35740'], 'Prunus_avium': ['gene-LOC110759850'], 'Prunus_persica': ['18784265'], 'Ricinus_communis': ['J2O13_05G011667'], 'Theobroma_cacao': ['Thecc1EG019630'], 'Vitis_vinifera': ['GSVIVG01036485001']}