使用python处理生物信息数据（九）

Python学习的第九天，主要学习如何写自己的函数功能。

1. 从PDB文件中提取fasta序列

import struct

pdb_format = '6s5s1s4s1s3s1s1s4s1s3s8s8s8s6s6s10s2s3s'

amino_acids = {
    'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
    'PHE':'F', 'GLY':'G', 'HIS':'H', 'LYS':'K',
    'ILE':'I', 'LEU':'L', 'MET':'M', 'ASN':'N',
    'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
    'THR':'T', 'VAL':'V', 'TYR':'Y', 'TRP':'W'
    }

def threeletter2oneletter(residues):
    for i, threeletter in enumerate(residues):
        residues[i][0] = amino_acids[threeletter[0]] 
#这将氨基酸的三个字母的代码转换为一个字母的代码。 
#该函数读取[res_type，chain]对的列表，
#并用对应的单字母代码替换氨基酸三字母代码的第一个元素。

def get_residues(pdb_file):
    residues = []
    for line in pdb_file:
        if line[0:4] == "ATOM":
            tmp = struct.unpack(pdb_format, line.encode('utf-8'))
            tmp_to_string = [line.decode('utf-8') for line in tmp]
            ca = tmp_to_string[3].strip()
            if ca == 'CA':
                res_type = tmp_to_string[5].strip()
                chain = tmp_to_string[7]
                residues.append([res_type, chain])
    return residues

def write_fasta_records(residues, pdb_id, fasta_file):
    seq = ''
    chain = residues[0][1]
    for aa, new_chain in residues:
        if new_chain == chain:
            seq = seq + aa
        else:
            fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))
            seq = aa
            chain = new_chains
    fasta_file.write(">%s_%s\n%s\n" % (pdb_id, chain, seq))

def extract_sequence(pdb_id):
    pdb_file = open(pdb_id + ".pdb")
    fasta_file = open(pdb_id + ".fasta", "w")
    residues = get_residues(pdb_file)
    threeletter2oneletter(residues)
    write_fasta_records(residues, pdb_id, fasta_file)
    pdb_file.close()
    fasta_file.close()
    
extract_sequence("3G5U")

2. 写一个极简单的函数

def calc_sum(num1, num2):
    '''一个名为calc_sum函数，定义函数功能，对两个参数求和，然后返回出结果'''
    result = num1 + num2
    return result
    
calc_sum(23,7)  #calc_sum计算23和7的和
Out[6]: 30

###直接返回函数运行后的结果
def increment(number):
    '''返回给定的数字加1的结果'''
    return number + 1
    
increment(5) #给定5，返回值为6
Out[8]: 6

###直接打印出给定的参数
def print_arg(number):
    print(number)
    
print_arg(5) #输入5，打印出5
5

###上述两个函数的套用
print_arg(increment(5)) #先是increment(5)函数，对5进行加1操作，然后print_arg()直接打印出increment(5)函数运行后的结果

3. 使用struct模块将字符串转为元组

import struct

format = "1s2s1s1s" #1s表示一个字符为一个单位， 2s则表示两个字符为一个单位

line = "12345"

col = struct.unpack(format, line.encode("utf-8") #将line这个字符串以format的格式拆分开
)

col_to_string = [line.decode("utf-8") for line in col]

print(col)
(b'1', b'23', b'4', b'5')

print(col_to_string)
['1', '23', '4', '5']

###calcsize()函数这个是统计format格式中有多少个字符
format = '30s30s20s1s'

print(struct.calcsize(format))
81

4. 将任意数量的参数转化为tab符分隔的字符串

def convert_to_string(*args):
    result = [str(a) for a in args]
    return "\t".join(result) + "\n"

output_file = open("nucleotideSubstitMatrix.txt", "w")

output_file.write(convert_to_string('', 'A', 'T', 'C', 'G'))
Out[17]: 9

output_file.write(convert_to_string('A', 1.0))
Out[18]: 6

output_file.write(convert_to_string('T', 0.5, 1.0))
Out[19]: 10

output_file.write(convert_to_string('C', 0.1, 0.1, 1.0))
Out[20]: 14

output_file.write(convert_to_string('G', 0.1, 0.1, 0.5, 1.0))
Out[21]: 18

output_file.close()

5. 提取PDB文件中特定的残基

import struct

from parse_pdb import parse_atom_line

def main(pdb_filename, residues, output_filename):
    pdb = open(pdb_filename)
    outfile = open(output_filename, "w")
    for line in pdb:
        if line.startswith("ATOM"):
            chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
            for aa, num in residues:
                if res_type == aa and res_num == num:
                    outfile.write(line)
    outfile.close()

residues = [('ASP', '102'), ('HIS', '57'), ('SER', '195')]
main("1TLD.pdb", residues, "trypsin_triad.pdb")

6. 计算两个原子之间的距离

#计算PDB文件中两个原子间的距离
from math import sqrt
from distance import calc_dist
from parse_pdb import parse_atom_line

pdb = open('3G5U.pdb')
points = []

while len(points) < 2:
    line = pdb.readline()
    if line.startswith("ATOM"):
        chain, res_type, res_num, atom, x, y, z = parse_atom_line(line)
        if res_num == '123' and chain == 'A' and atom == 'CA':
            points.append((x, y, z))
        if res_num == '209' and chain == 'A' and atom == 'CA':
            points.append((x, y, z))


print(calc_dist(points[0], points[1]))
35.219262627147664

####计算PDB中所有CA原子的距离
from math import sqrt
from distance import calc_dist
from parse_pdb import parse_atom_line

def get_ca_atoms(pdb_filename):
    pdb_file = open(pdb_filename, "r") #提取ca原子的数据列表
    ca_list = []
    for line in pdb_file:
        if line.startswith('ATOM'):
            data = parse_atom_line(line)
            chain, res_type, res_num, atom, x, y, z = data
            if atom == 'CA' and chain == 'A': 
                ca_list.append(data)
    pdb_file.close()
    return ca_list

ca_atoms = get_ca_atoms("1TLD.pdb")
for i, atom1 in enumerate(ca_atoms):
    # save coordinates in a variable
    name1 = atom1[1] + atom1[2]
    coord1 = atom1[4:]
    # compare atom1 with all other atoms
    for j in range(i+1, len(ca_atoms)):
        atom2 = ca_atoms[j]
        name2 = atom2[1] + atom2[2]
        coord2 = atom2[4:]
        # calculate the distance between atoms
        dist = calc_dist(coord1, coord2)
        print(name1, name2, dist)

最后编辑于：2020.03.25 15:40:06

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 215,539评论 6赞 497
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 91,911评论 3赞 391
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 161,337评论 0赞 351
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 57,723评论 1赞 290
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 66,795评论 6赞 388
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 50,762评论 1赞 294
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 39,742评论 3赞 416
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 38,508评论 0赞 271
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 44,954评论 1赞 308
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 37,247评论 2赞 331
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 39,404评论 1赞 345
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 35,104评论 5赞 340
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,736评论 3赞 324
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 31,352评论 0赞 21
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,557评论 1赞 268
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 47,371评论 2赞 368
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 44,292评论 2赞 352

使用python处理生物信息数据（九）

1. 从PDB文件中提取fasta序列

2. 写一个极简单的函数

3. 使用struct模块将字符串转为元组

4. 将任意数量的参数转化为tab符分隔的字符串

5. 提取PDB文件中特定的残基

6. 计算两个原子之间的距离

推荐阅读更多精彩内容