利用Python中的pysam模块做一些简单的统计（4，可能是最后一篇）

今天要连发两篇文章，那是因为...算了不矫情了
下面献上我最后的作品吧

#coding=utf-8
import pysam
import re
import os

class Gene:
    """Simple per-record statistics over an opened BAM/SAM file.

    Every method walks ``self.samfile`` from its current position, converts
    each record to text with ``str()`` and extracts the base sequence from
    that text with a regular expression.  The instance never touches module
    globals, so several ``Gene`` objects (one per BAM file) can coexist.
    """

    # Greedy pattern that pulls the base sequence out of ``str(record)``.
    # NOTE(review): the greedy ``.*`` spans from the first to the last run of
    # three bases in the record text, so it can capture more than the read
    # itself (probably the source of the over-long extractions filtered
    # below).  If records are pysam AlignedSegment objects, their
    # ``query_sequence`` attribute is presumably a more reliable source --
    # confirm before switching.
    _SEQUENCE_PATTERN = re.compile("[AGCT][AGCT][AGCT].*[AGCT][AGCT][AGCT]")
    # Derives the output file name from a path located under "/桌面/".
    _NAME_PATTERN = re.compile("/桌面/(.*).bam")
    # Four "A" followed by either "A" or a literal "*".
    # NOTE(review): "[A*]" is a character class, not a repetition -- confirm
    # that "at least five consecutive A" is the intended filter.
    _A_RUN_PATTERN = re.compile("[A][A][A][A][A*]")
    # Extractions of this length or longer are treated as failed.
    _MAX_SEQUENCE_LENGTH = 400
    # Scratch file (in the current working directory) that collects output
    # before it is renamed after the input file.
    _TEMP_NAME = "bam_squence"

    def __init__(self, samfile, path_in):
        """Store the record source and the path it was opened from.

        Args:
            samfile: Iterable of alignment records (e.g. an opened
                ``pysam.AlignmentFile``); each record is read via ``str()``.
            path_in: Path of the BAM file, used only to name output files.
        """
        self.samfile = samfile
        self.path_in = path_in

    def _numbered_records(self, limit):
        """Yield ``(number, record)`` for at most ``limit`` records.

        Numbering starts at 1.  Exactly ``limit`` records are consumed from
        ``self.samfile`` (fewer if it runs out); nothing is consumed when
        ``limit`` is zero or negative.
        """
        if limit <= 0:
            return
        for number, line in enumerate(self.samfile, start=1):
            yield number, line
            if number >= limit:
                return

    def _extract_sequence(self, line):
        """Return the base sequence found in ``str(line)``, or "" if none."""
        return " ".join(self._SEQUENCE_PATTERN.findall(str(line)))

    def _output_name(self):
        """Return the name for the output file derived from ``self.path_in``.

        Uses the part of the path between "/桌面/" and ".bam"; when the path
        does not have that shape, falls back to the file name without its
        extension.
        """
        matches = self._NAME_PATTERN.findall(self.path_in)
        if matches:
            return " ".join(matches)
        return os.path.splitext(os.path.basename(self.path_in))[0]

    def _save_sequences(self, squence_number, keep=None):
        """Append extracted sequences to a text file named after the input.

        Sequences from the first ``squence_number`` records are appended, one
        per line, to the scratch file in the current working directory, which
        is then renamed to ``_output_name()`` in that same directory.  The
        rename happens even when the input holds fewer records than
        requested.

        Args:
            squence_number: Number of records to read.
            keep: Optional predicate on the extracted sequence; when given,
                only sequences for which it returns true are written.
        """
        output_name = self._output_name()
        # Open once instead of once per record; append mode is kept so an
        # existing scratch file is extended rather than truncated.
        with open(self._TEMP_NAME, "a") as out:
            for _, line in self._numbered_records(squence_number):
                sequence = self._extract_sequence(line)
                if len(sequence) >= self._MAX_SEQUENCE_LENGTH:
                    continue  # over-long extraction, treated as failed
                if keep is not None and not keep(sequence):
                    continue
                out.write(sequence)
                out.write("\n")
        if output_name and output_name != self._TEMP_NAME:
            # os.replace overwrites an existing target on every platform.
            os.replace(self._TEMP_NAME, output_name)
        print("执行完毕")

    def GC_squence_read(self, number):
        """Print the first ``number`` records, each prefixed by its index.

        Intended for quick inspection.  A ``number`` that is never reached
        (zero, negative, or larger than the input) prints every record.
        """
        for n, line in enumerate(self.samfile, start=1):
            print(n, line)
            if n == number:
                break

    def GC_total_read(self, squence_number):
        """Print the GC content of the first ``squence_number`` records.

        For every usable sequence the length, G count, C count and GC
        fraction are printed.  A summary follows with the number of records
        read, the highest and lowest GC fraction with the record numbers
        where they first occurred (``None`` if no sequence was usable), and
        the records whose extraction failed (empty, or 400+ characters).

        Args:
            squence_number: Number of records to read.
        """
        gc_max = gc_min = None
        number_max = number_min = None
        failed = []  # record numbers whose sequence could not be used
        processed = 0
        for number, line in self._numbered_records(squence_number):
            processed = number
            sequence = self._extract_sequence(line)
            length = len(sequence)
            # An empty extraction would divide by zero below, so it is
            # counted as a failure together with the over-long ones.
            if not 0 < length < self._MAX_SEQUENCE_LENGTH:
                failed.append(number)
                continue
            print("序列長度：", length)
            g_count = sequence.count("G")
            print("G數量", g_count)
            c_count = sequence.count("C")
            print("C數量", c_count)
            gc_fraction = (g_count + c_count) / length
            print(number, gc_fraction)
            # Strict comparisons keep the first record on ties.
            if gc_max is None or gc_fraction > gc_max:
                gc_max = gc_fraction
                number_max = number
            if gc_min is None or gc_fraction < gc_min:
                gc_min = gc_fraction
                number_min = number
        print("統計條數%s,最大GC含量%s,編號%s,最小GC含量%s,編號%s," % (
            processed, gc_max, number_max, gc_min, number_min))
        print("提取失敗序列合計%s條,編號爲%s：" % (len(failed), failed))

    def pysam_save(self, squence_number):
        """Save the sequences of the first ``squence_number`` records as text.

        The output lands in the current working directory, named after the
        input file (see ``_output_name``).  Sequences of 400+ characters are
        skipped.
        """
        print(self._output_name())
        self._save_sequences(squence_number)

    def AGCT_accuracy(self, squence_number):
        """Save only the sequences that contain a long run of "A".

        Works like ``pysam_save`` but keeps a sequence only when
        ``_A_RUN_PATTERN`` matches somewhere in it.
        """
        self._save_sequences(
            squence_number,
            keep=lambda sequence: self._A_RUN_PATTERN.search(sequence) is not None,
        )
if __name__ == '__main__':
    # Location of the BAM file to analyse.
    path_in = "/home/charmflystar/桌面/LibPrep71_rawlib.bam"
    # The context manager closes the BAM handle when the block exits, even
    # if one of the statistics calls raises.
    with pysam.AlignmentFile(path_in, "rb") as samfile:
        f1 = Gene(samfile, path_in)  # create one Gene instance per BAM file
        # Uncomment the statistic to run.
        #f1.GC_total_read(1000000)   # argument: number of records to analyse (here the first 1,000,000)
        #f1.GC_squence_read(20)  # argument: number of records to print, for testing
        #f1.pysam_save(10)    # argument: number of records to save as text, e.g. the first 10; use sparingly -- dumping 2,000,000 sequences produced a 300 MB file
        #f1.AGCT_accuracy(1000000)   # argument: number of records to scan (here 1,000,000)

不知不觉敲了100多行代码，一步一个脚印，我坚持过，努力过。敲代码带给了我快乐，我的确喜欢这份工作，尽管处处是bug，但代码跑通那一瞬间的喜悦……也就只有敲过代码的人才会懂。给我发需求的人已经离职了，这份代码也没有必要继续写下去了。不过我觉得，未来充满可能，你我都在这个社会中挣扎着，希望不要让社会改变自己，大家都要努力！

面向百度和书本编程

利用python中的pysam模块做一些简单的统计(4可能是最后一篇)

推荐阅读更多精彩内容