Python学习的第九天,主要学习如何写自己的函数功能。
1. 从PDB文件中提取fasta序列
import struct
pdb_format = '6s5s1s4s1s3s1s1s4s1s3s8s8s8s6s6s10s2s3s'
amino_acids = {
'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
'PHE':'F', 'GLY':'G', 'HIS':'H', 'LYS':'K',
'ILE':'I', 'LEU':'L', 'MET':'M', 'ASN':'N',
'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
'THR':'T', 'VAL':'V', 'TYR':'Y', 'TRP':'W'
}
def threeletter2oneletter(residues):
for i, threeletter in enumerate(residues):
residues[i][0] = amino_acids[threeletter[0]]
#这将氨基酸的三个字母的代码转换为一个字母的代码。
#该函数读取[res_type,chain]对的列表,
#并用对应的单字母代码替换氨基酸三字母代码的第一个元素。
def get_residues(pdb_file):
residues = []
for line in pdb_file:
if line[0:4] == "ATOM":
tmp = struct.unpack(pdb_format, line.encode('utf-8'))
tmp_to_string = [line.decode('utf-8') for line in tmp]
ca = tmp_to_string[3].strip()
if ca == 'CA':
res_type = tmp_to_string[5].strip()
chain = tmp_to_string[7]
residues.append([res_type, chain])
return residues
def write_fasta_records(residues, pdb_id, fasta_file):
seq = ''
chain = residues[0][1]
for aa, new_chain in residues:
if new_chain == chain:
seq = seq + aa
else:
fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))
seq = aa
chain = new_chains
fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))
def extract_sequence(pdb_id):
pdb_file = open(pdb_id + ".pdb")
fasta_file = open(pdb_id + ".fasta", "w")
residues = get_residues(pdb_file)
threeletter2oneletter(residues)
write_fasta_records(residues, pdb_id, fasta_file)
pdb_file.close()
fasta_file.close()
extract_sequence("3G5U")
2. 写一个极简单的函数
def calc_sum(num1, num2):
'''一个名为calc_sum函数,定义函数功能,对两个参数求和,然后返回出结果'''
result = num1 + num2
return result
calc_sum(23,7) #calc_sum计算23和7的和
Out[6]: 30
###直接返回函数运行后的结果
def increment(number):
'''返回给定的数字加1的结果'''
return number + 1
increment(5) #给定5,返回值为6
Out[8]: 6
###直接打印出给定的参数
def print_arg(number):
print(number)
print_arg(5) #输入5,打印出5
5
###上述两个函数的套用
print_arg(increment(5)) #先是increment(5)函数,对5进行加1操作,然后print_arg()直接打印出increment(5)函数运行后的结果
3. 使用struct模块将字符串转为元组
import struct
format = "1s2s1s1s" #1s表示一个字符为一个单位, 2s则表示两个字符为一个单位
line = "12345"
col = struct.unpack(format, line.encode("utf-8") #将line这个字符串以format的格式拆分开
)
col_to_string = [line.decode("utf-8") for line in col]
print(col)
(b'1', b'23', b'4', b'5')
print(col_to_string)
['1', '23', '4', '5']
###calcsize()函数这个是统计format格式中有多少个字符
format = '30s30s20s1s'
print(struct.calcsize(format))
81
4. 将任意数量的参数转化为tab符分隔的字符串
def convert_to_string(*args):
result = [str(a) for a in args]
return "\t".join(result) + "\n"
output_file = open("nucleotideSubstitMatrix.txt", "w")
output_file.write(convert_to_string('', 'A', 'T', 'C', 'G'))
Out[17]: 9
output_file.write(convert_to_string('A', 1.0))
Out[18]: 6
output_file.write(convert_to_string('T', 0.5, 1.0))
Out[19]: 10
output_file.write(convert_to_string('C', 0.1, 0.1, 1.0))
Out[20]: 14
output_file.write(convert_to_string('G', 0.1, 0.1, 0.5, 1.0))
Out[21]: 18
output_file.close()
5. 提取PDB文件中特定的残基
import struct
from parse_pdb import parse_atom_line
def main(pdb_filename, residues, output_filename):
pdb = open(pdb_filename)
outfile = open(output_filename, "w")
for line in pdb:
if line.startswith("ATOM"):
chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
for aa, num in residues:
if res_type == aa and res_num == num:
outfile.write(line)
outfile.close()
residues = [('ASP', '102'), ('HIS', '57'), ('SER', '195')]
main("1TLD.pdb", residues, "trypsin_triad.pdb")
6. 计算两个原子之间的距离
#计算PDB文件中两个原子间的距离
from math import sqrt
from distance import calc_dist
from parse_pdb import parse_atom_line
pdb = open('3G5U.pdb')
points = []
while len(points) < 2:
line = pdb.readline()
if line.startswith("ATOM"):
chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
if res_num == '123' and chain == 'A' and atom == 'CA':
points.append((x, y, z))
if res_num == '209' and chain == 'A' and atom == 'CA':
points.append((x, y, z))
print(calc_dist(points[0], points[1]))
35.219262627147664
####计算PDB中所有CA原子的距离
from math import sqrt
from distance import calc_dist
from parse_pdb import parse_atom_line
def get_ca_atoms(pdb_filename):
pdb_file = open(pdb_filename, "r") #提取ca原子的数据列表
ca_list = []
for line in pdb_file:
if line.startswith('ATOM'):
data = parse_atom_line(line)
chain, res_type, res_num, atom, x, y, z = data
if atom == 'CA' and chain == 'A':
ca_list.append(data)
pdb_file.close()
return ca_list
ca_atoms = get_ca_atoms("1TLD.pdb")
for i, atom1 in enumerate(ca_atoms):
# save coordinates in a variable
name1 = atom1[1] + atom1[2]
coord1 = atom1[4:]
# compare atom1 with all other atoms
for j in range(i+1, len(ca_atoms)):
atom2 = ca_atoms[j]
name2 = atom2[1] + atom2[2]
coord2 = atom2[4:]
# calculate the distance between atoms
dist = calc_dist(coord1, coord2)
print(name1, name2, dist)