import os
import requests
from lxml import etree
'''
Download novels from www.biquw.com
'''
class BiquwNovel:
    # Initialize with the index page URL of the book
    def __init__(self, url):
        self.__url = url

    # Fetch a page with a custom User-Agent and parse it into an lxml tree
    def __parse(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
        }
        try:
            self.__html = requests.get(url, headers=headers).text
        except Exception as e:
            print(e)
            return False
        self.__tree = etree.HTML(self.__html)
        return True
    # Download the novel and save each chapter under the given directory
    def download(self, file_path):
        # Stop immediately if fetching the index page fails
        detect = self.__parse(self.__url)
        if detect is False:
            return
        # Extract the book title
        book_name = self.__tree.xpath('//h1/text()')[0]
        # Extract the chapter links
        chapter_links = self.__tree.xpath('//div[@class="book_list"]/ul//li/a/@href')
        # Create a storage directory named after the book
        file_path = os.path.join(file_path, book_name)
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        print('Start downloading... {}'.format(book_name.strip()))
        # Loop over all chapters and download their content
        for link in chapter_links:
            detect = self.__parse(self.__url + link)
            if detect is False:
                continue
            chapter_name = self.__tree.xpath('//h1/text()')[0]
            content = self.__tree.xpath('//div[@id="htmlContent"]/text()')
            # Save the downloaded chapter content under the chapter name
            self.__process_text(file_path, chapter_name, content)
            print(chapter_name.strip() + ' ... downloaded!')
    # Write one chapter's content to a text file
    @staticmethod
    def __process_text(file_path, chapter_name, content):
        with open(os.path.join(file_path, '{}.txt'.format(chapter_name)), 'w', encoding='utf-8') as f:
            for text in content:
                text = text.strip()
                if text:
                    f.write(text + '\n\n')
# Create an instance for the book's index page URL
book = BiquwNovel('http://www.biquw.com/book/900/')
# Call download() to fetch and save the novel (use a raw string for the Windows path)
book.download(r'E:\Python\python_work')