# Uses requests to download a novel from the Biquge (笔趣阁) site.
# -*- coding: utf-8 -*-
import requests
import re
import time
import os
import requests.packages.urllib3.util.ssl_
# Overwrite the DEFAULT_CIPHERS attribute of urllib3's ssl_ helper module
# (as bundled under requests.packages) with the string 'ALL'.
# NOTE(review): presumably a workaround for TLS handshake failures against
# the target site -- confirm it is still needed. Newer urllib3 releases may
# no longer read this attribute, in which case the assignment has no effect.
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
def get_html(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The response body as ``str``, decoded with the encoding requests
        detects from the content (``apparent_encoding``).

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    print("get_html")
    # Browser-like User-Agent. The original code built this dict but never
    # passed it to the request, so it is now actually sent.
    header = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/56.0.2924.90 Safari/537.36 '
            '2345Explorer/9.3.2.17331'
        ),
    }
    # requests encodes request parameters itself, so no manual urlencode to
    # bytes is needed; GET takes ``params=`` and POST takes ``data=``.
    response = requests.get(url, headers=header, timeout=timeout)
    # Decode with the encoding detected from the content.
    response.encoding = response.apparent_encoding
    return response.text
def novel_information(html, reg):
    """Return every match of the regular expression ``reg`` in ``html``.

    Args:
        html: Text to search.
        reg: Regular-expression pattern string.

    Returns:
        The list produced by ``findall`` for the pattern: the captured group
        of each match when the pattern has one group, otherwise the whole
        match. Empty list when nothing matches.
    """
    print("novel_information")
    pattern = re.compile(reg)
    return pattern.findall(html)
def get_regular_text(text):
    """Convert the raw HTML of a chapter body into plain text.

    Args:
        text: List of matched strings (as returned by ``re.findall``); only
            the first element is used.

    Returns:
        The first element as ``str`` with ``&nbsp;`` entities and
        non-breaking space characters replaced by ordinary spaces and
        ``<br>`` tags (``<br>``, ``<br/>``, ``<br />``) replaced by newlines.
        An empty string when ``text`` is empty.

    Side effects:
        Adds the length of the returned text to the module-level ``count``.
    """
    global count
    print("------get rgulax text----")
    if not text:
        # Nothing was matched; return an empty body instead of failing on
        # text[0].
        return ""
    # Always work on the matched string itself. Previously the input list
    # could be returned unchanged, which broke string concatenation in the
    # caller.
    new_word = text[0]
    new_word = new_word.replace('&nbsp;', ' ').replace('\xa0', ' ')
    new_word = re.sub(r'<br\s*/?>', '\n', new_word, flags=re.IGNORECASE)
    print(type(new_word), 'length=', len(new_word))
    count += len(new_word)
    return new_word
def get_onechapter(url):
    """Download one chapter page and return its title and cleaned body.

    Args:
        url: Address of the chapter page.

    Returns:
        A string made of a leading space, the chapter title (text of the
        first ``<h1>``), a newline, and the body text extracted from
        ``<div id="content">`` after cleaning by ``get_regular_text``.

    Raises:
        ValueError: If the page has no ``<h1>`` title or no content div.
    """
    print("------get_onechapter----")
    html = get_html(url)
    title_pattern = r'<h1>(.*?)</h1>'
    # (?s) lets "." match newlines, so a chapter body spread over several
    # lines of HTML is still captured up to the first closing </div>.
    content_pattern = r'(?s)<div id="content">(.*?)</div>'
    novel_chapter_name = novel_information(html, title_pattern)
    text = novel_information(html, content_pattern)
    print("下载:------", count)
    print(novel_chapter_name)
    # Fail with a message naming the page instead of an opaque IndexError.
    if not novel_chapter_name:
        raise ValueError("no chapter title found in " + str(url))
    if not text:
        raise ValueError("no chapter content found in " + str(url))
    chapt_text = " " + novel_chapter_name[0] + '\n' + get_regular_text(text)
    return chapt_text
def download_onechapter(words, path="F://暗黑系暖婚.txt"):
    """Append one chapter to the output text file.

    Args:
        words: Chapter text, either a single string or an iterable of
            strings that are written back to back.
        path: File to append to. Change the default to your own location.
            The file is created if it does not exist and written as UTF-8.

    The chapter is followed by one newline so consecutive chapters do not
    run together.
    """
    print("------download_onechapter----")
    # Join once and write once instead of issuing one write() per character
    # or fragment.
    chapter = words if isinstance(words, str) else "".join(words)
    with open(path, 'a', encoding="utf-8") as f:
        f.write(chapter)
        f.write('\n')
def chapter_all(url, re3, host):
    """Collect the URLs of all chapters listed on a table-of-contents page.

    Args:
        url: Address of the novel's table-of-contents page.
        re3: Regular expression whose group captures each chapter link
            WITHOUT its ``.html`` suffix; the suffix is appended here.
        host: Unused. Kept so existing callers keep working; chapter links
            are resolved against ``url`` instead.

    Returns:
        List of absolute chapter URLs in the order they appear on the page.
    """
    # Local import so this function brings its own dependency with it.
    from urllib.parse import urljoin

    print("------chapter_all----")
    html = get_html(url)
    chapter_list = novel_information(html, re3)
    print("length=", len(chapter_list), type(chapter_list))
    # Resolve each link against the page it came from rather than a
    # hard-coded site root: works for absolute paths, relative paths and
    # full URLs, and always targets the site actually being scraped.
    chapters = [urljoin(url, href + '.html') for href in chapter_list]
    print(chapters)
    return chapters
def download_novel(url_all):
    """Download every chapter in ``url_all`` and append it to the output file.

    Args:
        url_all: Iterable of chapter URLs, processed in order. To resume an
            interrupted run, pass a slice that starts at the first missing
            chapter.

    The download is best-effort: a chapter that fails for any reason is
    reported and skipped so the remaining chapters are still fetched.
    """
    print("------download_novel----")
    for chapt in url_all:
        try:
            words = get_onechapter(chapt)
            download_onechapter(words)
        except Exception as e:
            # Keep going, but say which chapter was lost and why instead of
            # dropping it silently.
            print("failed to download " + str(chapt) + ": " + str(e))
        finally:
            print("GOTO ANOTHER CHAPTER")
# Running total of cleaned chapter-text characters: incremented by
# get_regular_text and printed by get_onechapter.
count = 0
def main():
    """Download the whole novel and report the elapsed time.

    Fetches the chapter list from the table-of-contents page, downloads
    every chapter, and prints the total elapsed time even if something
    fails along the way.
    """
    print("------main----")
    # Same site as url0 below; passed through to chapter_all.
    host = "https://www.biquge.biz"
    # Table-of-contents page of the novel. Open it in a browser for
    # reference and change it to the novel you want.
    url0 = "https://www.biquge.biz/22_22126/"
    re3 = r'<dd><a href="(.*?).html"'
    # time.clock() was removed in Python 3.8; perf_counter() is its
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    try:
        chapter_urls = chapter_all(url0, re3, host)
        download_novel(chapter_urls)
    except Exception as e:
        print("has error" + str(e))
    finally:
        end = time.perf_counter()
        print("总计用时:%.2f s" % (end - start))
# Start the download only when the file is run as a script, so importing
# the module for its helper functions does not trigger network traffic.
if __name__ == "__main__":
    main()