import os

import requests
from lxml import etree
class NeZha():
    """Download the novel hosted at quanshuwan.com/book/702.aspx.

    Fetches the table-of-contents page, resolves every chapter link, then
    saves each chapter as a UTF-8 ``<title>.txt`` file under ``./小说/``.
    """

    def __init__(self):
        # Table-of-contents page of the book, and the site root used to
        # turn relative chapter hrefs into absolute URLs.
        self.url = 'http://www.quanshuwan.com/book/702.aspx'
        self.url_tou = 'http://www.quanshuwan.com'
        # Browser-like headers; sent with every request so the site does
        # not reject us as an obvious bot.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
            'Referer': 'https: // movie.douban.com / subject / 26794435 / reviews?start = 480'
        }

    def get_url(self, reponse):
        """Return the list of absolute chapter URLs found in *reponse*.

        ``reponse`` keeps the original (misspelled) parameter name so
        existing keyword callers keep working.
        """
        html = etree.HTML(reponse.text)
        urls = html.xpath('//div[@id = "readlist"]//li/a/@href')
        # The hrefs are site-relative; prefix the host to make them absolute.
        return [self.url_tou + href for href in urls]

    def parse_url(self, url):
        """Download one chapter page and write it to ``./小说/<title>.txt``."""
        # Bug fix: the original omitted the headers here even though
        # spider() sends them for the table-of-contents request.
        response = requests.get(url, headers=self.header)
        html = etree.HTML(response.text)
        title = html.xpath('//div[@id="content"]/h1/text()')[0]
        contents = html.xpath('//div[@id="content"]/p/text()')
        # Bug fix: create the output directory instead of crashing with
        # FileNotFoundError on a fresh checkout; os.path.join replaces the
        # Windows-only backslash path.
        out_dir = '小说'
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, '%s.txt' % title), 'w', encoding='utf-8') as f:
            f.write(title)
            f.write('\n')
            for paragraph in contents:
                f.write(" " + paragraph)
                f.write("\n")
        # (redundant f.close() removed -- the with-block closes the file)

    def spider(self):
        """Entry point: fetch the TOC, then download every chapter."""
        response = requests.get(self.url, headers=self.header)
        links = self.get_url(response)
        for i, link in enumerate(links):
            self.parse_url(link)
            # Single-line progress indicator, overwritten in place.
            print("\r第{}章下载完成,共{}章,剩余{:.2%}".format(i + 1, len(links), 1 - ((i + 1) / len(links))), end='', flush=True)
if __name__ == '__main__':
    # Build the downloader and run the full scrape.
    downloader = NeZha()
    downloader.spider()
# 电子书下载器 (e-book downloader)
# ©著作权归作者所有，转载或内容合作请联系作者
# 平台声明：文章内容（如有图片或视频亦包括在内）由作者上传并发布，文章内容仅代表作者本人观点，简书系信息发布平台，仅提供信息存储服务。