# 用户标签提取 (user tag extraction)

# 项目思路是根据用户点赞过的帖子提取文本标签（主要是地理位置）。根据tags进行匹配帖子用作推荐。
# -*- coding:utf-8 -*-
# 此脚本获得User_Likes_Label；获得用户点赞帖子的地理标签

from pymongo import MongoClient
from bson import ObjectId
import jieba.analyse
import json
import re

# Load the tag vocabulary from place.txt, one entry per line.
# `place` maps each stripped line to 1; only its keys are used (membership
# tests), the value is a placeholder.
place = {}
# The context manager closes the file handle even if reading fails.
with open('place.txt', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        place[line] = 1

# Point jieba's keyword extractor at the custom IDF table.
jieba.analyse.set_idf_path('idfall.txt')
# Connect to the MongoDB server.
# NOTE(review): the server address is hard-coded; consider moving it to
# configuration or an environment variable.
host = '47.94.14.59'
client = MongoClient(host, 27017)
print(client)
db = client['aituwen']
mycol = db['user_likes']    # source collection: one document per user with "author_id" and "likes"
posts = db['posts']         # post documents, looked up by "_id"
user_tags = db['user_tags'] # destination collection for the per-user tag counts
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the supported way to count every document.
num = mycol.count_documents({})
author_id = {}   # str(author_id) -> [likes]
id_label = {}    # ObjectId(author_id) -> {tag: count}
topK = 20        # number of keywords requested from jieba per post
post_tags = {}   # cache: post id -> extracted tags, so each post is tokenized once
# Characters kept from post text: CJK ideographs U+4E00-U+9FA5 plus a fixed
# set of CJK/full-width punctuation marks.  Compiled once because the same
# pattern is applied to every title, media body and description.
# NOTE(review): the '-' directly after the \u4e00-\u9fa5 range is matched as a
# literal hyphen, not as a range operator -- confirm that is intended.
_TEXT_PATTERN = re.compile(
    r"[\u4e00-\u9fa5-\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]")


def _keep_text(raw):
    """Return the characters of *raw* matched by _TEXT_PATTERN, joined in order.

    Returns None when *raw* is not a str (e.g. the field is missing), so the
    caller can skip that field instead of failing.
    """
    if not isinstance(raw, str):
        return None
    return ''.join(_TEXT_PATTERN.findall(raw))


def _post_sentence(doc):
    """Build the text to tokenize for one post document.

    The result is the filtered title followed by '。', then every filtered
    media body, then '。' + filtered description + '!'.  Fields that are
    missing or not strings are left out.
    """
    title = _keep_text(doc.get("title"))
    sentence = '' if title is None else title + '。'
    # A post without a "media" field contributes no body text.
    for item in doc.get("media") or []:
        body = _keep_text(item.get('body'))
        if body is not None:
            sentence = sentence + body
    description = _keep_text(doc.get("description"))
    if description is not None:
        sentence = sentence + '。' + description + '!'
    return sentence


# For every user: collect the tags of each liked post and count how often each
# tag that appears in `place` occurs.  A higher count means the user liked
# more posts carrying that tag.
for x in mycol.find({}, {"author_id": 1, "likes": 1}):
    author = x.get("author_id")
    likes = x.get("likes")
    author_id[str(author)] = [likes]
    label = {}  # tag -> number of liked posts carrying it, for this user
    # A user document without "likes" is treated as having liked nothing.
    for post in likes or []:
        if post in post_tags:
            # Already tokenized for an earlier user: reuse the cached tags.
            tags = post_tags[post]
        else:
            doc = posts.find_one({"_id": ObjectId(post)},
                                 {"title": 1, "description": 1, "media": 1})
            if doc is None:
                # Liked post not present in the posts collection.
                continue
            sentence = _post_sentence(doc)
            try:
                tagsidf = jieba.analyse.extract_tags(
                    sentence, topK=topK, allowPOS=('ns', 's', 'n', 'vn', 'nsf'))
            except Exception:
                # Best effort: a post that cannot be tokenized yields no tags.
                tagsidf = []
            tags = list(set(tagsidf))
            post_tags[post] = tags

        # Only tags present in the `place` vocabulary are counted.
        for tag in tags:
            if tag in place:
                label[tag] = label.get(tag, 0) + 1

    id_label[ObjectId(author)] = label
    print('_id:',author)
    print('label:',label)
    # 将帖子存入user_label 存入新的集合中；
# Write one document per user into user_tags:
# {'author_id': <ObjectId>, 'tags': {tag: count}}.
for key, values in id_label.items():
    # The original read an undefined name `value` here, which raised
    # NameError on the first iteration; the loop variable is `values`.
    dic = {'author_id': ObjectId(key), 'tags': values}
    print(dic)
    user_tags.insert_one(dic)

print('done')
# 用户标签提取

# 友情链接更多精彩内容