# 项目思路是根据用户点赞过的帖子提取文本标签(主要是地理位置)。根据tags进行匹配帖子用作推荐。
# -*- coding:utf-8 -*-
# 此脚本获得User_Likes_Label;获得用户点赞帖子的地理标签
from pymongo import MongoClient
from bson import ObjectId
import jieba.analyse
import json
import re
# Load the tag library: each stripped line of place.txt becomes a key of
# `place` (the value is always 1), so later code can test whether an
# extracted keyword is a known tag with a dict lookup.
place = {}
# The context manager closes the file once it has been read; the original
# opened it and never closed it.
with open('place.txt', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        place[line] = 1
# Use the custom IDF dictionary for jieba's keyword extraction.
jieba.analyse.set_idf_path('idfall.txt')

# Connect to the MongoDB server and grab the collections used below.
host = '47.94.14.59'
client = MongoClient(host, 27017)  # reuse `host` instead of repeating the literal
print(client)
db = client['aituwen']
mycol = db['user_likes']    # read in the main loop: documents with author_id / likes
posts = db['posts']         # looked up by _id for title / description / media
user_tags = db['user_tags']  # output collection for the per-user tag counts

# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the supported way to count every document.
num = mycol.count_documents({})

# Shared state filled in by the main loop.
author_id = {}   # str(author_id) -> [likes]
id_label = {}    # ObjectId(author_id) -> {tag: count}
topK = 20        # number of keywords requested from jieba per post
post_tags = {}   # post id -> extracted tags, so each post is analysed only once
# Characters kept when cleaning text: the \u4e00-\u9fa5 range plus the listed
# punctuation code points (the pattern is reproduced exactly as originally
# written). Compiled once because it is applied to every title, media body
# and description.
_TEXT_CHARS = re.compile(
    r"[\u4e00-\u9fa5-\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]")


def _clean_text(text):
    """Return the characters of *text* matched by _TEXT_CHARS, joined together.

    Returns None when *text* is not a string (e.g. the field is missing), so
    callers can skip it instead of relying on a bare ``except``.
    """
    if not isinstance(text, str):
        return None
    return ''.join(_TEXT_CHARS.findall(text))


def _build_sentence(title, description, media):
    """Concatenate the cleaned title, media bodies and description of a post.

    The layout matches the original script: ``<title>。`` first, then every
    cleaned media body, then ``。<description>!``. Missing (non-string) parts
    are skipped, and a missing ``media`` list is treated as empty.
    """
    parts = []
    clean_title = _clean_text(title)
    if clean_title is not None:
        parts.append(clean_title + '。')
    # `media or []` guards against posts without a media field, which used to
    # raise TypeError on len(None).
    for item in media or []:
        clean_body = _clean_text(item.get('body'))
        if clean_body is not None:
            parts.append(clean_body)
    clean_description = _clean_text(description)
    if clean_description is not None:
        parts.append('。' + clean_description + '!')
    return ''.join(parts)


# author_id maps str(author_id) -> [likes]
# One iteration per user document.
for x in mycol.find({}, {"author_id": 1, "likes": 1}):
    author = x.get("author_id")
    likes = x.get("likes")
    author_id[str(author)] = [likes]
    label = {}  # this user's tag counts: tag -> number of liked posts carrying it
    # One iteration per liked post; a user without a likes field has none.
    for post in likes or []:
        if post in post_tags:
            # Already analysed for an earlier user: reuse the cached tags.
            tags = post_tags[post]
        else:
            doc = posts.find_one(
                {"_id": ObjectId(post)},
                {"title": 1, "description": 1, "media": 1})
            if doc is None:
                # The liked post no longer exists in `posts`; nothing to tag.
                continue
            sentence = _build_sentence(
                doc.get("title"), doc.get("description"), doc.get("media"))
            # Extract keywords from the sentence. Kept best-effort as in the
            # original, but without swallowing KeyboardInterrupt/SystemExit.
            try:
                tagsidf = jieba.analyse.extract_tags(
                    sentence, topK=topK,
                    allowPOS=('ns', 's', 'n', 'vn', 'nsf'))
            except Exception:
                tagsidf = []
            tags = list(set(tagsidf))
            post_tags[post] = tags
        # Count only keywords that are in the tag library; a higher count
        # means the tag showed up in more of the user's liked posts.
        for tag in tags:
            if tag in place:
                label[tag] = label.get(tag, 0) + 1
    id_label[ObjectId(author)] = label
    print('_id:',author)
    print('label:',label)
# Write each user's tag counts to the user_tags collection, one document
# per user: {'author_id': <ObjectId>, 'tags': {tag: count}}.
for key, values in id_label.items():
    # The original referenced an undefined name `value` here, which raised
    # NameError on the first iteration; the loop variable is `values`.
    dic = {'author_id': ObjectId(key), 'tags': values}
    print(dic)
    user_tags.insert_one(dic)
print('done')
用户标签提取
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...