写在前面,因为没有做任何延时处理,为了保护网站,还请不要重复运行代码!!!
最近帮女朋友搞生信,tbtools理化性质分析怎么都搞不好,网站又只能一个一个分析,一气之下自己写一个爬虫来批量计算理化性质!!!!!
貌似生信人用简书多一点,希望能帮到。
使用方法很简单,有 Python 环境就行:把后面所有函数复制一下,然后把代码中标注“这里放你的……”的占位路径换成自己的文件路径即可。
有其他疑问可以email:haoli@stu.jsu.edu.cn
import csv
import json
import logging
import re
import time

import requests
from bs4 import BeautifulSoup
from Bio import SeqIO
from lxml import etree
###获取Proparam网页信息
def get_protparam_results(sequence, timeout=30):
    """POST a protein sequence to the Expasy ProtParam service.

    Parameters
    ----------
    sequence : str
        FASTA-style text (">id\\nSEQ") as produced by ``parse_fasta``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        The original had no timeout, so a stalled connection hung forever.

    Returns
    -------
    requests.Response
        The raw HTTP response; callers parse ``response.text``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    url = "https://web.expasy.org/cgi-bin/protparam/protparam"
    payload = {
        'sequence': sequence,
        'mandatory': ''
    }
    response = requests.post(url, data=payload, timeout=timeout)
    # Fail fast on server-side errors instead of silently handing an
    # error page to the HTML parsers downstream.
    response.raise_for_status()
    return response
###分割fasta序列,批量输入
from Bio import SeqIO
def parse_fasta(file_path):
    """Read a FASTA file and return ``(sequences, ids)``.

    Each entry of *sequences* is "<id>\\n<sequence>" — exactly the text
    pasted into the ProtParam form — and *ids* holds the record ids in
    the same order.
    """
    sequences = []
    ids = []
    for record in SeqIO.parse(file_path, "fasta"):
        ids.append(str(record.id))
        sequences.append(f"{str(record.id)}\n{str(record.seq)}")
    return sequences, ids
###正则表达式提取字符串中的数字
def extract_numbers(text):
    """Return the first (optionally negative) integer or decimal in *text*, as a string.

    Parameters
    ----------
    text : str
        A ProtParam result line such as "Theoretical pI: 6.53".

    Raises
    ------
    ValueError
        If *text* contains no number.  The original indexed
        ``re.findall(...)[0]`` and crashed with an opaque IndexError here;
        it also assigned a dead ``result = ''`` that was never used.
    """
    match = re.search(r'-?\d+\.?\d*', text)
    if match is None:
        raise ValueError(f"no number found in: {text!r}")
    return match.group()
###提取所需要的信息
def extract_result(gene, geneid, web):
    """Parse one ProtParam results page and store its metrics under ``gene[geneid]``.

    Parameters
    ----------
    gene : dict
        Accumulator mapping gene id -> metrics dict; mutated in place.
    geneid : str
        Key under which this record's metrics are stored.
    web : str
        HTML text of the ProtParam results page.

    The original also computed ``sequence = soup.find('pre').text.strip()``
    and never used it — removed here as dead work.
    """
    tree = etree.HTML(web)
    details = {}
    # The first three metrics sit in fixed positions of the second <pre>
    # block.  NOTE(review): these absolute XPaths break if Expasy ever
    # changes its page layout — confirm against a live response.
    details['Number of amino acids'] = extract_numbers(tree.xpath('/html/body/main/div/pre[2]/text()[1]')[0])
    details['Molecular weight'] = extract_numbers(tree.xpath('/html/body/main/div/pre[2]/text()[2]')[0])
    details['Theoretical pI'] = extract_numbers(tree.xpath('/html/body/main/div/pre[2]/text()[3]')[0])
    # The last three metrics are the text immediately following the final
    # three <strong> labels on the page.
    soup = BeautifulSoup(web, 'html.parser')
    strong_tags = soup.find_all('strong')
    details['Instability index'] = extract_numbers(strong_tags[-3].next_sibling.strip())
    details['Aliphatic index'] = extract_numbers(strong_tags[-2].next_sibling.strip())
    details['Grand average of hydropathicity'] = extract_numbers(strong_tags[-1].next_sibling.strip())
    gene[geneid] = details
def json_to_csv(json_data, output_file):
    """Write a nested ``{gene_id: {attribute: value}}`` dict to a CSV file.

    The header row is ``geneid`` followed by the attribute names taken from
    the first record; each subsequent row is one gene in insertion order.

    Parameters
    ----------
    json_data : dict[str, dict]
        Mapping of gene id to its metrics dict; all records are assumed to
        share the first record's attribute keys.
    output_file : str
        Path of the CSV file to create (overwritten if it exists).
    """
    # encoding pinned so the output is stable across platforms; the
    # original relied on the locale default.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        if not json_data:
            # Nothing scraped: leave an empty file instead of crashing on
            # json_data[gene_ids[0]] like the original did.
            return
        gene_ids = list(json_data)
        attributes = list(json_data[gene_ids[0]])
        csvwriter.writerow(['geneid'] + attributes)
        for gene_id in gene_ids:
            csvwriter.writerow([gene_id] + [json_data[gene_id][attr] for attr in attributes])
#### Main script: fetch ProtParam results for every record in the FASTA
#### file and save the metrics to a CSV.
# Replace with the path to your FASTA file.
file_path = r'这里放你的序列的fasta文件'
sequences_227, geneid = parse_fasta(file_path)

# Without basicConfig the root logger stays at WARNING, so the
# logging.info progress messages in the original were never shown.
logging.basicConfig(level=logging.INFO)

gene = {}
for gid, seq in zip(geneid, sequences_227):
    results = get_protparam_results(seq)
    extract_result(gene, gid, results.text)
    # Lazy %-args: the message is only formatted if the record is emitted.
    logging.info('%s has finished!', gid)
    # Be polite to the server — the intro explicitly warns that the
    # original had no delay between requests.
    time.sleep(1)
json_to_csv(gene, r'这里放你要保存结果的路径\output.csv')