今天给大家分享一个爬取中华诗词网所有诗词的 Python 爬虫。本项目主要使用了 pyquery 库和 requests 库(其实只用 pyquery 就可实现)。废话不多说,直接上代码:
from time import sleep
import requests
from pyquery import PyQuery as pq
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
base_url = "https://www.shi-ci.com"
def get_index_page(url):
    """Fetch the site index page and collect the href of every <li> child link.

    Args:
        url: Absolute URL of the index page to fetch.

    Returns:
        A list of absolute URLs built as ``base_url + "/" + href``.
        Returns an empty list on a non-200 response (the previous version
        returned None, which made iterating callers crash with TypeError).
    """
    urls = []
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        doc = pq(response.text)
        for item in doc("li").items():
            href = item.children().attr("href")
            # attr() returns None when the child has no href attribute; the old
            # code stringified it and relied on len("None") == 4 failing the
            # len > 5 test. Check for None explicitly instead.
            if href is not None and len(href) > 5:
                urls.append(base_url + "/" + href)
    return urls
def get_author_page(url):
    """Fetch an author listing page and collect poem-collection links.

    Args:
        url: Absolute URL of the author page to fetch.

    Returns:
        A list of absolute URLs taken from the children of ``.poem-preview``
        elements. Returns an empty list on a non-200 response (the previous
        version returned None, which made iterating callers crash).
    """
    urls = []
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        doc = pq(response.text)
        for item in doc(".poem-preview").items():
            href = item.children().attr("href")
            # Explicit None check replaces the old str(None)/len hack.
            if href is not None and len(href) > 5:
                urls.append(base_url + "/" + href)
    return urls
def get_poem_list(url):
    """Fetch a poem-list page and collect links to individual poem pages.

    Structurally identical to :func:`get_author_page` — both scrape the
    ``.poem-preview`` elements — and kept consistent with it.

    Args:
        url: Absolute URL of the poem-list page to fetch.

    Returns:
        A list of absolute poem-page URLs. Returns an empty list on a
        non-200 response (the previous version returned None, which made
        iterating callers crash).
    """
    urls = []
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        doc = pq(response.text)
        for item in doc(".poem-preview").items():
            href = item.children().attr("href")
            # Explicit None check replaces the old str(None)/len hack.
            if href is not None and len(href) > 5:
                urls.append(base_url + "/" + href)
    return urls
# NOTE(review): dead code — an earlier CSV-writing variant of get_poem_page,
# superseded by the plain-text version below. Kept for reference only; it also
# depends on a `csv` import that this file does not have.
# def get_poem_page(url):
#     response = requests.get(url, headers=headers)
#     if response.status_code == 200:
#         doc = pq(response.text)
#         title = doc("#poem>h1").text()
#         year = doc("#poem>h3").text()
#         content = doc("#poem>div").text()
#         try:
#             with open("poems.csv","a",encoding="utf-8") as csvf:
#                 filenames = ["title","year","content"]
#                 writer = csv.DictWriter(csvf,filenames,delimiter=" ")
#                 writer.writerow({"title": title,"year": year,"content": content})
#                 print("%ssuccess"%title)
#         except ValueError as e:
#             print(e)
#     return None
def get_poem_page(url):
    """Fetch one poem page and append title/year/content to ``poems.txt``.

    Non-200 responses are skipped silently (best-effort scrape).

    Args:
        url: Absolute URL of the poem page.

    Returns:
        None.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return None
    doc = pq(response.text)
    # The poem page keeps title/year/body as direct children of #poem.
    parts = [
        doc("#poem>h1").text(),
        doc("#poem>h3").text(),
        doc("#poem>div").text(),
    ]
    record = "\n".join(parts) + "\n-------------------\n"
    with open("poems.txt", "a", encoding="utf-8") as out:
        out.write(record)
    print("%ssuccess" % parts[0])
    return None
if __name__ == "__main__":
# urls = get_index_page(base_url)[:2]
# for url in urls:
# author_urls = get_author_page(url)[:2]
# for author_url in author_urls:
# poem_urls = get_poem_list(author_url)
# for poem_url in poem_urls:
# # sleep(0.5)
# get_poem_page(poem_url)
urls = get_index_page(base_url)
for url in urls:
author_urls = get_author_page(url)
for author_url in author_urls:
sleep(0.5)
poem_urls = get_poem_list(author_url)
for poem_url in poem_urls:
get_poem_page(poem_url)