Python 词频统计并生成词云

1. 看效果

image.png

2. 看代码

github地址:StatWordOfPoem

步骤:

  • 1.协程爬取诗词网站获取诗词内容
  • 2.分词
  • 3.生成词云
  • 4.用法 eg: python main.py 苏轼
    main.py内容:
# coding=utf8
import requests
from sys import argv
from bs4 import BeautifulSoup
import re, time
import aiohttp
import asyncio
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import jieba
async def get_content(url):
    """Download ``url`` with a one-off aiohttp session.

    Returns the raw body produced by ``resp.read()`` when the server answers
    with status 200, and the empty string ``''`` for any other status.
    """
    # ``async with`` is the asynchronous context-manager form; both the
    # session and the response are released when their blocks exit.
    async with aiohttp.ClientSession() as client:
        async with client.request('GET', url) as reply:
            if reply.status == 200:
                return await reply.read()
            return ''

async def _fetch(session, url):
    """Return the body of ``url`` fetched through ``session``, or ``None`` unless the status is 200."""
    async with session.request('GET', url) as resp:
        if resp.status != 200:
            return None
        return await resp.read()


async def doTask(domain, url, f):
    """Crawl one poem-listing page and append its segmented poems to ``f``.

    Args:
        domain: Site root that is prefixed to the relative ``href`` of every
            poem link found on the listing page.
        url: Absolute URL of the listing page to crawl.
        f: Writable text file object; one line of space-separated jieba
            tokens is written per poem.

    Listing pages or poem pages that do not answer with status 200, links
    without an ``href`` and pages without a ``.shici-content`` node are
    skipped silently.
    """
    print(url)
    # A single session serves the listing page and every poem page it links
    # to, instead of opening a fresh session for each request.
    async with aiohttp.ClientSession() as session:
        html = await _fetch(session, url)
        if html is None:
            return
        listing = BeautifulSoup(html, 'html.parser')
        for link in listing.select('h3>a'):
            href = link.get('href')
            if not href:
                continue
            phtml = await _fetch(session, ''.join([domain, href]))
            if phtml is None:
                continue
            page = BeautifulSoup(phtml, 'html.parser')
            # Prefer the first ``.para`` child; fall back to the whole
            # ``.shici-content`` container when there is none.
            nodes = page.select('.shici-content>.para') or page.select('.shici-content')
            if not nodes:
                continue
            poem = str(nodes[0].get_text())
            segList = jieba.cut(poem, cut_all=True)
            # The trailing newline keeps the last token of this poem from
            # being glued to the first token of the next poem written.
            f.write(' '.join(segList) + '\n')
def main():
    """Look up the author given on the command line and crawl their poems.

    Reads the author name from ``argv[1]``, queries the author search page,
    derives the listing-page URLs from the first result and runs ``doTask``
    on all of them concurrently. The segmented text is written to
    ``poem.txt``, which is overwritten on every run.

    Raises:
        SystemExit: with status 1 when no author name is given, the author
            is not found, or the result link carries no numeric id.
    """
    domain = 'http://www.shicimingju.com'
    authorUrl = domain + '/chaxun/zuozhe_list/'
    if len(argv) < 2:
        print('Please input an author name!')
        raise SystemExit(1)

    print(authorUrl + argv[1])
    response = requests.get(authorUrl + argv[1])
    soup = BeautifulSoup(response.content, 'html.parser')
    firstUrl = soup.select('h3>a')
    if not firstUrl:
        print("The author doesn't exist!")
        raise SystemExit(1)
    poemUrl = firstUrl[0].get('href')
    print(poemUrl)
    numbers = re.findall(r'\d+', poemUrl or '')
    if not numbers:
        print('Cannot find the author id in the poem URL!')
        raise SystemExit(1)
    poemNum = numbers[0]
    print(poemNum)

    # Page 1 is the link itself; pages 2..87 insert "_<page>" after the id.
    # Only the first occurrence is rewritten so that other digits in the
    # path are left alone. Pages past the author's last one are expected to
    # be skipped by doTask when they answer with a non-200 status.
    pageUrls = [''.join([domain, poemUrl])]
    for page in range(2, 88):
        pagedPath = poemUrl.replace(poemNum, poemNum + '_' + str(page), 1)
        pageUrls.append(''.join([domain, pagedPath]))

    async def crawl(f):
        # gather() accepts bare coroutines; asyncio.wait() no longer does.
        # return_exceptions keeps one failed page from aborting the rest.
        results = await asyncio.gather(
            *(doTask(domain, url, f) for url in pageUrls),
            return_exceptions=True,
        )
        for url, result in zip(pageUrls, results):
            if isinstance(result, Exception):
                print('Failed to crawl %s: %r' % (url, result))

    # 'w' truncates the file on open, so every run starts from empty.
    with open('poem.txt', 'w', encoding='utf-8') as f:
        asyncio.run(crawl(f))

def draw():
    """Render a word cloud from ``poem.txt`` and save it as ``<argv[1]>.png``.

    Reads the segmented text from ``poem.txt``, uses ``alice_mask.png`` as
    the shape mask and ``fzjl.ttf`` as the font, writes the image next to
    the script and finally shows it in a matplotlib window. Returns early
    with a message when ``poem.txt`` holds no text.
    """
    # Read the segmented text; the context manager closes the file handle.
    with open('poem.txt', encoding='utf-8') as f:
        text = f.read()
    if not text.strip():
        # Nothing was crawled, so there is nothing to draw.
        print('No words were collected; skip drawing the word cloud.')
        return

    # Load the mask image into an array and release the image file.
    with Image.open("alice_mask.png") as img:
        alice_mask = np.array(img)

    stopwords = set(STOPWORDS)
    stopwords.add("said")

    wc = WordCloud(background_color="white", font_path='fzjl.ttf', max_words=2000, mask=alice_mask,
                   stopwords=stopwords, contour_width=3, contour_color='steelblue')

    # Build the cloud from the text.
    wc.generate(text)

    # Save it under the author name passed on the command line.
    wc.to_file(argv[1] + ".png")

    # Show the picture.
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()

if __name__ == '__main__':
    # Time the whole run: crawling first, then rendering.
    began = time.time()
    main()  # crawl the poems and write the segmented text
    draw()  # draw the word cloud from that text
    elapsed = time.time() - began
    print('总耗时:%.5f秒' % elapsed)
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

  • # Python 资源大全中文版 我想很多程序员应该记得 GitHub 上有一个 Awesome - XXX 系列...
    小迈克阅读 3,130评论 1 3
  • 一、Python简介和环境搭建以及pip的安装 4课时实验课主要内容 【Python简介】: Python 是一个...
    _小老虎_阅读 6,356评论 0 10
  • Python语言特性 1 Python的函数参数传递 看两个如下例子,分析运行结果: 代码一: a = 1 def...
    伊森H阅读 3,184评论 0 15
  • 认识我的几乎都知道我家里有一只兔子,姓孙名坨坨,随着时间流逝,在他每天作天作地,撕纸啃电线的情况下,现在改名了,人...
    sour那个丹阅读 378评论 0 0
  • 现在长大了,奶奶经常跟我说的一句话就是找对象要好好找,最好是男方要长得好,家里父母比较年轻,有车有房的等等,有的时...
    温柔11阅读 267评论 0 0

友情链接更多精彩内容