First, install and import jieba:
#pip install jieba
import jieba
Test jieba:
txt = '我来到北京清华大学'
seg_list = jieba.lcut(txt)
print(seg_list)
# ['我', '来到', '北京', '清华大学']
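As an aside (not used below), jieba also offers a full-segmentation mode via the cut_all flag; a quick sketch, with the output quoted from jieba's documentation example:

print(jieba.lcut('我来到北京清华大学', cut_all=True))
# ['我', '来到', '北京', '清华', '清华大学', '华大', '大学']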
Load the Three Kingdoms text 'threekingdom.txt', segment it, and store the words in a list:
with open('threekingdom.txt', 'r', encoding='utf-8') as f:
    txt = f.read()
# segment the whole text into a list of words
words = jieba.lcut(txt)
# print(words)
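If jieba splits a name you care about into pieces, you can register it in the dictionary before segmenting; a hedged aside using jieba.add_word (the example word is illustrative only):

# teach jieba to keep a multi-character name as a single token
jieba.add_word('诸葛孔明')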
Count word frequencies, skipping words of length 1:
counts = {}
for word in words:
    if len(word) == 1:
        # single characters are mostly particles and punctuation, skip them
        continue
    else:
        # add/update the dict entry, e.g. counts['曹操'] = counts.get('曹操', 0) + 1
        counts[word] = counts.get(word, 0) + 1
# print(counts)
Merge entries that refer to the same character and add their counts:
# merge aliases of the same character (e.g. '孔明曰' counts toward '孔明')
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = counts.get('玄德', 0) + counts.get('玄德曰', 0)
counts['玄德'] = counts.get('玄德', 0) + counts.get('刘备', 0)
counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)
Define a set of high-frequency words that are not character names (plus the alias keys merged above) and remove them from the counts:
excludes = {"将军", "却说", "丞相", "二人", "不可", "荆州", "不能", "如此", "商议",
            "如何", "主公", "军士", "军马", "左右", "次日", "引兵", "大喜", "天下",
            "东吴", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人马", "不知",
            '孔明曰', '玄德曰', '云长', '刘备'}
# remove irrelevant words
for word in excludes:
    del counts[word]
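If you are not sure that every excluded word actually occurs in the text, dict.pop with a default avoids a KeyError; a small defensive variant:

for word in excludes:
    counts.pop(word, None)  # no error if the word never appeared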
Two ways to pick out the 10 most frequent words:
# method 1: sort the dict items by count, descending
items = list(counts.items())
# print('before sorting:', items)
items.sort(key=lambda x: x[1], reverse=True)
# print('after sorting:', items)
for i in range(10):
    character, count = items[i]
    print(character, count)
# method 2: use collections.Counter
# roles = Counter(counts)
# role = roles.most_common(10)
# print(role)
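Method 2 needs an extra import (already included in the full code below); a minimal runnable sketch of the same top-10 query:

from collections import Counter

roles = Counter(counts)        # Counter accepts an existing {word: count} dict
print(roles.most_common(10))   # list of (word, count) pairs, highest count first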
Build the word cloud string. First import WordCloud (pip install wordcloud):
from wordcloud import WordCloud

# repeat each of the top-10 names once per occurrence, then join them into one string
li = []
for i in range(10):
    character, count = items[i]
    for _ in range(count):
        li.append(character)
# print(li)
cloud_txt = ",".join(li)
wc = WordCloud(
    background_color='white',
    font_path='msyh.ttc',
    # whether to include two-word collocations; the default is True
    collocations=False
).generate(cloud_txt)
wc.to_file('三国词云.png')
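As an alternative sketch (reusing the sorted items list from above), WordCloud can also consume a frequency dict directly via generate_from_frequencies, which skips building the repeated-word string:

# build a {word: count} dict for the top 10 and feed it to WordCloud directly
top10 = dict(items[:10])
wc = WordCloud(
    background_color='white',
    font_path='msyh.ttc',   # a Chinese-capable font is required to render the names
    collocations=False
).generate_from_frequencies(top10)
wc.to_file('三国词云.png')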
Finally, wrap everything in a parse() function and call it. Full code:
# pip install jieba
import jieba
from collections import Counter
from wordcloud import WordCloud

# txt = '我来到北京清华大学'
# seg_list = jieba.lcut(txt)
# print(seg_list)


def parse():
    """Word-frequency statistics for character appearances in the Three Kingdoms novel."""
    # set of high-frequency words that are not character names
    excludes = {"将军", "却说", "丞相", "二人", "不可", "荆州", "不能", "如此", "商议",
                "如何", "主公", "军士", "军马", "左右", "次日", "引兵", "大喜", "天下",
                "东吴", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人马", "不知",
                '孔明曰', '玄德曰', '云长', '刘备'}
    with open('threekingdom.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    # segment the whole text into a list of words
    words = jieba.lcut(txt)
    # print(words)

    # count word frequencies, e.g. counts['曹操'] -> 555
    counts = {}
    for word in words:
        if len(word) == 1:
            # skip single characters (mostly particles and punctuation)
            continue
        else:
            # add/update the dict entry, e.g. counts['曹操'] = counts.get('曹操', 0) + 1
            counts[word] = counts.get(word, 0) + 1
    # print(counts)

    # merge aliases of the same character
    counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
    counts['玄德'] = counts.get('玄德', 0) + counts.get('玄德曰', 0)
    counts['玄德'] = counts.get('玄德', 0) + counts.get('刘备', 0)
    counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)
    # remove irrelevant words
    for word in excludes:
        del counts[word]

    # method 1: sort by count and print the top 10
    items = list(counts.items())
    # print('before sorting:', items)
    items.sort(key=lambda x: x[1], reverse=True)
    # print('after sorting:', items)
    for i in range(10):
        character, count = items[i]
        print(character, count)
    # method 2: use collections.Counter
    # roles = Counter(counts)
    # role = roles.most_common(10)
    # print(role)

    ## generate the word cloud
    # build the word cloud string
    li = []
    for i in range(10):
        character, count = items[i]
        for _ in range(count):
            li.append(character)
    # print(li)
    cloud_txt = ",".join(li)
    wc = WordCloud(
        background_color='white',
        font_path='msyh.ttc',
        # whether to include two-word collocations; the default is True
        collocations=False
    ).generate(cloud_txt)
    wc.to_file('三国词云.png')


parse()
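Note: the script assumes threekingdom.txt and the msyh.ttc font file (Microsoft YaHei here; any Chinese-capable font works) sit in the working directory; adjust font_path if they live elsewhere.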