今天我想用Linux重新统计一下每对通用引物能扩增出多少目科属种鸟类
首先用excel将rtfd中所有鸟类名称筛出来,列名为NAME
分别为12sspecies_list.xlsx |1cytb_list.xlsx | 2cytbspecies_list.xlsx | co1species_list.xlsx
其中用到excel中的分列及合并
合并的公式为 =A1 & “ ” & B1
import pandas as pd
from Bio import Entrez
from time import sleep
# 设置你的邮箱(NCBI要求提供邮箱地址)
Entrez.email = "maha2082@163.com"
# 从Excel读取鸟类的拉丁文名
input_file = '1cytb_list.xlsx' # 替换Excel文件名
df = pd.read_excel(input_file, sheet_name=0)
# 创建一个新列来存储结果
df['Order'] = ''
df['Family'] = ''
df['Genus'] = ''
df['Species'] = ''
# 定义一个函数来查询NCBI数据库
def fetch_taxonomy(name):
try:
handle = Entrez.esearch(db="taxonomy", term=name, retmode="xml")
record = Entrez.read(handle)
if record['IdList']:
taxon_id = record['IdList'][0]
tax_handle = Entrez.efetch(db="taxonomy", id=taxon_id, retmode="xml")
tax_record = Entrez.read(tax_handle)
lineage = tax_record[0]['LineageEx']
order = family = genus = species = ''
for item in lineage:
rank = item.get('Rank')
name = item.get('ScientificName')
if rank == 'order':
order = name
elif rank == 'family':
family = name
elif rank == 'genus':
genus = name
species = tax_record[0]['ScientificName']
return order, family, genus, species
except Exception as e:
print(f"Error fetching data for {name}: {e}")
return '', '', '', ''
# 遍历每个物种进行查询
for i, row in df.iterrows():
species_name = row['NAME']
print(f"Fetching data for {species_name}...")
order, family, genus, species = fetch_taxonomy(species_name)
df.at[i, 'Order'] = order
df.at[i, 'Family'] = family
df.at[i, 'Genus'] = genus
df.at[i, 'Species'] = species
sleep(0.5) # 为了避免被NCBI服务器封禁,添加延迟
# 保存结果到新的Excel文件
output_file = '1cytbbird_species_classification.xlsx'#修改文件名
df.to_excel(output_file, index=False)
print(f"结果已保存到 {output_file}")
#在目录中保存为classify.py,修改完文件名,直接运行即可
python3 classify.py
生成的excel文件为:
12sbird_species_classification.xlsx
1cytbbird_species_classification.xlsx
...
将excel按目科属种排序,补充空缺内容,删除重复物种
找到重复的步骤为:选中species列,格式,条件格式,新建规则,仅对唯一值或重复值设置格式,格式选择颜色,点击确定,再次点击确定。