拿着一个DEGs的list和基因functional annotation竟然没有找到简单的脚本一健生成Go或者Kegg的富集结果,啊这!(可能是我懒得找吧)
有需求就自己写一个好了,基于超几何分布和 Benjamini - Hochberg的P值矫正方法。
有关数学得详情点击这个https://guangchuangyu.github.io/cn/2012/04/enrichment-analysis/
需要原料:
这个类似这样功能注释的格式的表。
DEGs的list
依赖包:
import sys
from scipy import stats
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
statsr = importr('stats')
主要代码:
两个函数
def goread(goingo):#Go counts
totalgenenum = 0
filego = open(goingo,"r")
linessgo = filego.readlines()
linessgo = list(linessgo)
dicgonum = {}
dicgenegolist = {}
for i in linessgo:
totalgenenum +=1
i = i.strip()
i =i.split()
#print(i)
dicgenegolist[i[0]] = []
try:
for j in i[1].split(";"):
if j.find("GO") != -1: #only for Go
j = j.strip()
dicgenegolist[i[0]].append(j)
if j not in list(dicgonum.keys()):
dicgonum[j] = 1
else:
dicgonum[j] +=1
except:
continue
print(dicgenegolist)
return (dicgonum,dicgenegolist,totalgenenum)
def enrichment(m,n,a,b):#enrichment test adjustment
pvalue = stats.hypergeom.sf(m,n,a,b)
return pvalue
输入输出统计啥的:
goin = sys.argv[1]
goingo = sys.argv[2]
goout = sys.argv[3]
gogoout = open(goout,"w")
file2 = open(goin,"r")
liness = file2.readlines()
liness = list(liness)
dicgonum, dicgenegolist, totalgenenum = goread(goingo)
pvaluelist = []
mapnum =0
dicquerygonum = {}
samplenum = 0
print(dicgenegolist)
dicquerylist =[]
for i in liness:
i = i.strip()
samplenum +=1
try:
for j in dicgenegolist[i]:
if j not in dicquerylist:
dicquerygonum[j] = 1
dicquerylist.append(j)
else:
dicquerygonum[j] += 1
mapnum +=1
except:
continue
if mapnum == 0:
print("No gene mapped, check you data!")
else:
print(mapnum,"genes mapped")
for i in dicquerygonum.keys():
m = dicquerygonum[i]-1
n = totalgenenum
a = dicgonum[i]
b = mapnum
pvalue = stats.hypergeom.sf(m,n,a,b)
pvaluelist.append(pvalue)
P值矫正(BH)
p_adjust = statsr.p_adjust(FloatVector(pvaluelist), method = 'BH') #P value adjustment
输出:
diclist = list(dicquerygonum.keys())
for i in range(len(list(dicquerygonum.keys()))):
name = diclist[i]
oa = dicquerygonum[diclist[i]]
ob = mapnum
oc = dicgonum[diclist[i]]
od = totalgenenum-dicgonum[diclist[i]]
op = pvaluelist[i]
opadj = p_adjust[i]
print(name,oa,ob,oc,od,op,opadj,file = gogoout)
gogoout.close()
输出样品:
完整版看看Github:https://github.com/lifangpings/enrichmentgo.py