爬取中华诗词网所有诗词

今天给大家分享一个爬取中华诗词网所有诗词的Python爬虫。本项目主要使用了pyquery库和requests库(其实只用pyquery就可实现)。废话不多说,直接上代码:

from time import sleep

import requests
from pyquery import PyQuery as pq

# Root of the site being crawled; scraped hrefs are appended to it.
base_url = "https://www.shi-ci.com"

# Headers sent with every request: a desktop Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.139 Safari/537.36"
    ),
}
def get_index_page(url, timeout=10):
    """Collect the links found in the ``<li>`` items of a page.

    Downloads ``url`` and, for every ``<li>`` element, reads the ``href``
    reported for that element's children. Hrefs longer than five characters
    are turned into URLs under ``base_url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings (possibly empty), or ``None`` when the request
        fails or the server does not answer with HTTP 200.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failures use the same "no result" signal as a non-200 reply.
        return None
    if response.status_code != 200:
        return None

    doc = pq(response.text)
    urls = []
    for item in doc("li").items():
        href = item.children().attr("href")
        # Skip items without a link and very short hrefs (same > 5 cut-off
        # as before, but without stringifying a missing href to "None").
        if href and len(href) > 5:
            # Drop leading slashes so root-relative hrefs do not yield "//".
            urls.append(base_url + "/" + href.lstrip("/"))
    return urls

def get_author_page(url, timeout=10):
    """Collect the links found in the ``.poem-preview`` elements of a page.

    Downloads ``url`` and, for every element matching ``.poem-preview``,
    reads the ``href`` reported for that element's children. Hrefs longer
    than five characters are turned into URLs under ``base_url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings (possibly empty), or ``None`` when the request
        fails or the server does not answer with HTTP 200.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failures use the same "no result" signal as a non-200 reply.
        return None
    if response.status_code != 200:
        return None

    doc = pq(response.text)
    urls = []
    for item in doc(".poem-preview").items():
        href = item.children().attr("href")
        # Skip items without a link and very short hrefs (same > 5 cut-off
        # as before, but without stringifying a missing href to "None").
        if href and len(href) > 5:
            # Drop leading slashes so root-relative hrefs do not yield "//".
            urls.append(base_url + "/" + href.lstrip("/"))
    return urls

def get_poem_list(url, timeout=10):
    """Collect the poem links found in ``.poem-preview`` elements of a page.

    Downloads ``url`` and, for every element matching ``.poem-preview``,
    reads the ``href`` reported for that element's children. Hrefs longer
    than five characters are turned into URLs under ``base_url``.

    NOTE(review): this uses the same selector as ``get_author_page``;
    confirm against the site that both page types share that markup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings (possibly empty), or ``None`` when the request
        fails or the server does not answer with HTTP 200.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failures use the same "no result" signal as a non-200 reply.
        return None
    if response.status_code != 200:
        return None

    doc = pq(response.text)
    urls = []
    for item in doc(".poem-preview").items():
        href = item.children().attr("href")
        # Skip items without a link and very short hrefs (same > 5 cut-off
        # as before, but without stringifying a missing href to "None").
        if href and len(href) > 5:
            # Drop leading slashes so root-relative hrefs do not yield "//".
            urls.append(base_url + "/" + href.lstrip("/"))
    return urls

# def get_poem_page(url):
#     response = requests.get(url, headers=headers)
#     if response.status_code == 200:
#         doc = pq(response.text)
#         title = doc("#poem>h1").text()
#         year = doc("#poem>h3").text()
#         content = doc("#poem>div").text()
#         try:
#             with open("poems.csv","a",encoding="utf-8") as csvf:
#                 filenames = ["title","year","content"]
#                 writer = csv.DictWriter(csvf,filenames,delimiter=" ")
#                 writer.writerow({"title": title,"year": year,"content": content})
#                 print("%ssuccess"%title)
#         except ValueError as e:
#             print(e)
#     return None
def get_poem_page(url, timeout=10):
    """Download one poem page and append its text to ``poems.txt``.

    Reads the text of ``#poem>h1`` (title), ``#poem>h3`` (stored as
    ``year``) and ``#poem>div`` (content), then appends them to
    ``poems.txt`` as three lines followed by a dashed separator line.
    Nothing is written when the server does not answer with HTTP 200.

    Args:
        url: Address of the poem page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the result is the appended file content plus a
        progress line on stdout.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable poem should not abort the whole crawl. Report it,
        # since the return value carries no failure signal.
        print("%s failed: %s" % (url, exc))
        return None
    if response.status_code != 200:
        return None

    doc = pq(response.text)
    title = doc("#poem>h1").text()
    year = doc("#poem>h3").text()
    content = doc("#poem>div").text()
    record = title + "\n" + year + "\n" + content + "\n-------------------\n"
    # Append mode: every poem is added to the end of the same file.
    with open("poems.txt", "a", encoding="utf-8") as f:
        f.write(record)
    print("%ssuccess" % title)
    return None
if __name__ == "__main__":
    # Crawl in three levels: links from the index page, links from each of
    # those pages, then the poem links on each resulting page.
    # Every get_* helper may return None when a page cannot be fetched, so
    # fall back to an empty list and skip that branch instead of raising
    # TypeError on iteration.
    # For a quick trial run, slice the first two lists (e.g. ``[:2]``).
    for url in get_index_page(base_url) or []:
        for author_url in get_author_page(url) or []:
            sleep(0.5)  # brief pause before each poem-list request
            for poem_url in get_poem_list(author_url) or []:
                get_poem_page(poem_url)

喜欢就留下你的小💗

©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

  • 用两张图告诉你,为什么你的 App 会卡顿? - Android - 掘金 Cover 有什么料? 从这篇文章中你...
    hw1212阅读 13,633评论 2 59
  • Android 自定义View的各种姿势1 Activity的显示之ViewRootImpl详解 Activity...
    passiontim阅读 177,671评论 25 709
  • # Python 资源大全中文版 我想很多程序员应该记得 GitHub 上有一个 Awesome - XXX 系列...
    小迈克阅读 3,109评论 1 3
  • 病态的躯体 拖着旧皮囊 黑夜中游走 灵魂裸露在街巷 我的译本通过了吗 我的小说可以发表吗 下个月的生计能否维持呢 ...
    木易不易丁阅读 251评论 0 2
  • 文/ A 幸运点 纶巾羽扇卧龙吟,天下三分西蜀寻。 妙计安邦才博谷,奇门献主术通今。 七擒孟获南征事,六出祁山北伐...
    A幸运点阅读 564评论 22 17

友情链接更多精彩内容