Generating word clouds requires the wordcloud library. The following steps generate a word cloud for each of the .txt files in a folder:
import os
from wordcloud import WordCloud
import imageio  # image I/O library, used to load the mask image

img = imageio.imread('D:/cat.png')      # mask image that shapes the cloud
folder_path = 'G:/lyric/test'           # directory holding the .txt files
output_path = 'G:/lyric/test/testpic'   # directory for the generated images

for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        wc = WordCloud(font_path='msyh.ttc',   # a font that supports Chinese
                       background_color='white',
                       mask=img,
                       max_words=2000)
        wc.generate(text)
        output_file = os.path.join(output_path, file_name.replace('.txt', '.png'))
        wc.to_file(output_file)
Problem: whole sentences end up being counted as single words in the cloud, because the raw Chinese text has no spaces for WordCloud's tokenizer to split on.
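You can see this by inspecting the counts WordCloud extracts from unsegmented text. A minimal sketch (the sample sentence is made up for illustration):

from wordcloud import WordCloud

# WordCloud's default tokenizer splits on runs of word characters; a Chinese
# sentence without spaces or punctuation therefore comes back as one token.
sample = '今天天气很好我们去公园'
print(WordCloud().process_text(sample))
# e.g. {'今天天气很好我们去公园': 1} -- the whole sentence counted as one "word"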
Solution: bring in the Chinese word-segmentation library jieba, and configure the word cloud with the cn_stopwords.txt stopword list.
import jieba  # Chinese word segmentation library

# segment the text with jieba, then rejoin the tokens with spaces
words = " ".join(jieba.cut(text))
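For example, segmentation turns a sentence into space-separable tokens (the exact output depends on jieba's dictionary):

import jieba

print(jieba.lcut('今天天气很好'))
# tokens such as ['今天', '天气', '很', '好']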
The full revised script (the stopword list is loaded once, outside the loop):

import os
from wordcloud import WordCloud
import imageio  # image I/O library, used to load the mask image
import jieba    # Chinese word segmentation library

img = imageio.imread('D:/cat.png')
folder_path = 'G:/lyric/test'
output_path = 'G:/lyric/test/testpic'

# load the stopword list once, outside the loop
with open('cn_stopwords.txt', mode='r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        words = ' '.join(jieba.lcut(text))  # segment, then rejoin with spaces
        wc = WordCloud(font_path='msyh.ttc',
                       background_color='white',
                       mask=img,
                       max_words=2000,
                       stopwords=stopwords)
        wc.generate(words)
        output_file = os.path.join(output_path, file_name.replace('.txt', '.png'))
        wc.to_file(output_file)
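To spot-check the result, you can inspect the normalized frequencies WordCloud computed for the last file processed (wc.words_ is populated by generate()):

# the ten most frequent tokens; long sentences should no longer appear here
top = sorted(wc.words_.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top)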
After these changes the result is clearly better than before: long sentences no longer appear in the word cloud.