计算相关性的基本步骤分为三步:
1,连接neo4j数据库,并读取出里面的数据
2,对其进行算法运算
3,拿到运算结果后设定一个阈值,大于阈值的就判定为相关。
直接上代码:
第一步:
#导入工具包
import numpy as np
from neo4j.v1 import GraphDatabase, basic_auth, kerberos_auth, custom_auth, TRUST_ALL_CERTIFICATES
# Connect to the database and create the driver.
# The connection URI is redacted in the source; the credentials are the
# literal values used there.
driver = GraphDatabase.driver("***********", auth=basic_auth("neo4j", "neo4j"), trust=TRUST_ALL_CERTIFICATES)
session = driver.session()
# Run the query (at most 5 rows, per its LIMIT clause) and collect the first
# returned column (m.name) of every row, converted to str, into a list.
dat = session.run("MATCH (m)-[r]->(n) RETURN m.name, r.relation, n.name LIMIT 5")
blists = [str(d[0]) for d in dat]
第二步:
# Take the names out of the list and score them pair by pair.
# Each name is compared with every name that precedes it in ``blists``, so
# every unordered pair is visited exactly once.
# NOTE: this loop calls edit_distance() and Jaccrad(); when run as a flat
# script both must already be defined before this point is reached.
for i, a in enumerate(blists):
    for b in blists[:i]:
        print(a, b)
        longest = max(len(a), len(b))
        if longest == 0:
            # Both names are empty: there is nothing to compare, and the
            # normalisation below would divide by zero.
            continue
        td = Jaccrad(a, b)
        # Normalise the edit distance by the longer name's length, then
        # turn it into a similarity.
        std = edit_distance(a, b) / longest
        fy = 1 - std
        # Average of the two similarity scores
        huizon = (td + fy) / 2
        print('avg_sim: ', huizon)
# Edit-distance algorithm
def edit_distance(word1, word2):
    """Return the edit (Levenshtein) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``word1`` into ``word2``.

    Args:
        word1: First sequence (anything supporting ``len`` and indexing,
            e.g. a ``str``).
        word2: Second sequence.

    Returns:
        The distance as a NumPy float scalar (the table is created with
        ``np.zeros``, whose default dtype is float); its value is always a
        whole number.
    """
    len1 = len(word1)
    len2 = len(word2)
    # dp[i][j] holds the distance between word1[:i] and word2[:j].
    dp = np.zeros((len1 + 1, len2 + 1))
    # Turning a prefix into the empty sequence costs one deletion per element.
    for i in range(len1 + 1):
        dp[i][0] = i
    # Building a prefix from the empty sequence costs one insertion per element.
    for j in range(len2 + 1):
        dp[0][j] = j
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            # A substitution is free when the two elements already match.
            delta = 0 if word1[i - 1] == word2[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j - 1] + delta,  # substitute / match
                dp[i - 1][j] + 1,          # delete from word1
                dp[i][j - 1] + 1,          # insert into word1
            )
    return dp[len1][len2]
# Jaccard similarity coefficient
def Jaccrad(terms_model, reference):
    """Return the Jaccard similarity of the element sets of two iterables.

    Both arguments are converted to sets (for strings: sets of characters),
    and the result is ``|intersection| / |union|``.

    Args:
        terms_model: First iterable of hashable items, e.g. a ``str``.
        reference: Second iterable of hashable items.

    Returns:
        A float in ``[0.0, 1.0]``. When both inputs are empty the union is
        empty and the ratio is undefined; ``0.0`` is returned in that case
        instead of raising ``ZeroDivisionError``.
    """
    grams_reference = set(reference)
    grams_model = set(terms_model)
    fenmu = len(grams_model | grams_reference)
    if fenmu == 0:
        # Two empty inputs: no evidence of similarity, and nothing to divide by.
        return 0.0
    return len(grams_model & grams_reference) / fenmu
测试结果:
福州市委,福建省福州市委
avg_sim: 0.6190476190476191
福州市委,福建省委
avg_sim: 0.41666666666666663
第三步:
# Step 3: merge two entities whose averaged similarity falls inside the
# accepted interval.
# `jacd`, `edit`, `a`, `b`, `sd`, `session` and `deleted` come from code that
# is not shown in this fragment.
huizon = (jacd + edit) / 2
if 0.70 < huizon < 1:  # accept only scores strictly between 0.70 and 1
    # Keep the entity with the longer name and remove the other one
    # (on equal length `b` is kept and `a` is removed).
    keep, drop = (a, b) if len(a) > len(b) else (b, a)
    # presumably returns the relation each of a / b has towards `sd`;
    # `deleted` is defined elsewhere -- confirm against its definition
    rel_a, rel_b = deleted(a, b, sd)
    if rel_a == rel_b:
        # Same relation on both entities: the shorter one can simply go.
        # Names are passed as query parameters so that a quote inside a name
        # cannot break (or alter) the Cypher statement.
        session.run("match (n) where n.name = $name detach delete n", {"name": drop})
    else:
        # Different relations: copy the relation of the entity being removed
        # onto the entity being kept, then remove it.
        rels = session.run(
            "MATCH (n)-[r]->(m) where n.name = $src and m.name = $dst RETURN r.relation",
            {"src": drop, "dst": sd},
        )
        rows = rels.values()
        # An empty result means there is no relation to migrate.
        rls = rows[0][0] if rows else None
        if rls:
            # A relationship type cannot be a Cypher parameter, so it is
            # interpolated as a back-quoted identifier with embedded
            # back-quotes doubled.
            rel_type = str(rls).replace("`", "``")
            session.run(
                "MATCH (p),(q) where p.name = $src and q.name = $dst "
                "create unique (p)-[:`%s` {relation: $relation}]->(q)" % rel_type,
                {"src": keep, "dst": sd, "relation": rls},
            )
            # NOTE(review): the original indentation was lost, so it is not
            # certain whether this delete belonged inside `if rls:`; it is
            # kept here so a node is only removed after its relation has been
            # migrated -- confirm against the intended behaviour.
            session.run("match (n) where n.name = $name detach delete n", {"name": drop})
————————————————
原文链接:https://blog.csdn.net/for_yayun/article/details/100971894
https://blog.csdn.net/w5688414/article/details/103262721