参考:https://genome.sph.umich.edu/wiki/BamUtil:_diff
安装BamUtil:
conda install -c bioconda bamutil
或
conda install -c bioconda/label/cf201901 bamut
准备文件{bam2}
bam diff --in1 ${bam1} --in2 ${bam2} --mapQual --onlyDiffs > ${bamdiff}.txt
结果比较:
导入包:
import os, subprocess
import pandas as pd
定义函数整理文件为pandas的dataframe:
def GetBamdiffArr(Bamdifftxt):
BamdiffArr = []
SeqIDs = []
Bam1Dic, Bam2Dic = {}, {}
with open(Bamdifftxt, 'r') as fr:
lines = fr.readlines()
for line in lines:
if line.startswith(">"):
items = line.strip().split()
flag, pos, cigar, mapqual = "", "", "", ""
for index, item in enumerate(items[1:]):
if index == 0:
flag = item
elif not str.isalnum(item):
pos = item
elif str.isdigit(item):
mapqual = item
else:
cigar = item
Bam1Dic[key] = [flag, pos, cigar, mapqual]
elif line.startswith("<"):
items = line.strip().split()
flag, pos, cigar, mapqual = "", "", "", ""
for index, item in enumerate(items[1:]):
if index == 0:
flag = item
elif not str.isalnum(item):
pos = item
elif str.isdigit(item):
mapqual = item
else:
cigar = item
Bam2Dic[key] = [flag, pos, cigar, mapqual]
else:
key = line.strip()
SeqIDs.append(key)
for key in SeqIDs:
try:
flag1, pos1, cigar1, mapqual1 = Bam1Dic[key]
tag1 = 1
except KeyError:
flag1, pos1, cigar1, mapqual1 = "", "", "", ""
tag1 = 0
try:
flag2, pos2, cigar2, mapqual2 = Bam2Dic[key]
tag2 = -1
except KeyError:
flag2, pos2, cigar2, mapqual2 = "", "", "", ""
tag2 = 0
tag = tag1+tag2
BamdiffArr.append([key,flag1, pos1, cigar1, mapqual1, tag1, flag2, pos2, cigar2, mapqual2, tag2, tag])
return(BamdiffArr)
定义函数写文件:
def WritFile(GetBamdiffArr, samplename):
BamdiffDF = pd.DataFrame(BamdiffArr,columns = ["key", "flag1", "pos1", "cigar1", "mapqual1", "tag1", "flag2", "pos2", "cigar2", "mapqual2", "tag2", "tag"])
BamdiffDF[['mapqual1','mapqual2']] = BamdiffDF[['mapqual1','mapqual2']].apply(pd.to_numeric)
BamdiffDF["tag"][(BamdiffDF.loc[:,"pos2"]=="")&(BamdiffDF.loc[:,"cigar2"]=="")&(BamdiffDF.loc[:,"tag"]==1)] = "Bam1only"
BamdiffDF["tag"][(BamdiffDF.loc[:,"pos1"]=="")&(BamdiffDF.loc[:,"cigar1"]=="")&(BamdiffDF.loc[:,"tag"]==-1)] = "Bam2only"
BamdiffDF["tag"][((BamdiffDF.loc[:,"mapqual1"]<20)&(BamdiffDF.loc[:,"mapqual2"]<20))&(BamdiffDF.loc[:,"tag"]==0)] = "poorquality"
BamdiffDF["tag"][(BamdiffDF.loc[:,"pos1"]=="")&(BamdiffDF.loc[:,"cigar1"]=="")&(BamdiffDF.loc[:,"pos2"]=="")&(BamdiffDF.loc[:,"cigar2"]=="")&(BamdiffDF.loc[:,"tag"]!="poorquality")] = "match"
BamdiffDF["tag"][(BamdiffDF.loc[:,"pos1"]=="")&(BamdiffDF.loc[:,"cigar1"]=="")&(BamdiffDF.loc[:,"tag"]!="poorquality")] = "match"
BamdiffDF["tag"][BamdiffDF.loc[:,"tag"]==0] = "mismatch"
pd.DataFrame(BamdiffDF).to_csv(f"./{samplename}.csv", index=False)
TagUniqCounts = pd.Series(BamdiffDF.loc[:,"tag"]).value_counts()
pd.DataFrame(TagUniqCounts).to_excel(f"./{samplename}.counts.xlsx", sheet_name='sheet1')
执行函数:
samplename = Bamdifftxt.strip(".txt")
BamdiffArr = GetBamdiffArr(Bamdifftxt)
WritFile(GetBamdiffArr, samplename)
最后根据xlsx文件,可以将结果整理成venn图。