# Web-scraping practice script (learning exercise). Environment: Python 3.6.
from urllib import request
import re
from html.parser import HTMLParser
import html2text
# Root URL of the site being scraped; book and chapter URLs are built from it.
home_url = "https://www.77xsw.la/"

# Captures the text of the active breadcrumb item (used as the chapter title).
title_pattern = re.compile(r'<li class="active">(.*?)</li>')

# Captures the numeric id of every "<id>.html" chapter link on the index page.
# The dot is escaped so that only a literal ".html" suffix matches.
chapter_pattern = re.compile(r'<a href=[\'\"]([0-9]*?)\.html[\'\"] title=".*?">')

# Captures the chapter body; re.S lets "." span the newlines inside the <div>.
content_pattern = re.compile(r'<div class="panel-body" id="htmlContent">(.*?)</div>',re.S)

# Captures the "<chapter>_<page>" id of the "next page" link of a split chapter.
next_page_pattern = re.compile(r'<a id="linkNext" class="btn btn-default" href="([0-9]*?_[0-9]*?)\.html">下一页</a>')

# Matches the site's banner line that is stripped from every chapter body.
tag_pattern = re.compile(r'千千小说网 www\.77xsw\.la,最快更新<a href="https://www\.77xsw\.la/book/[0-9]*?/">.*?</a>最新章节!<br><br>')

# Matches any single whitespace character.
empty_pattern = re.compile(r'\s')
class Spider:
    """Download the chapters of one book and append them to a text file.

    The class relies on the module-level ``home_url`` and the compiled
    ``*_pattern`` regular expressions, and on the ``html2text`` package that
    the file imports.
    """

    # Marker found in a page body when the chapter continues on another page.
    _CONTINUE_MARKER = (
        '-->><p class="text-danger text-center mg0">本章未完,点击下一页继续阅读</p>'
    )

    def __init__(self, book_id):
        """Remember the id of the book to download.

        Args:
            book_id: Identifier used in the ``book/<book_id>/`` URLs.
        """
        self.book_id = book_id

    def _page_url(self, page_id):
        """Return the URL of the page ``<page_id>.html`` of this book."""
        return "{}book/{}/{}.html".format(home_url, self.book_id, page_id)

    @staticmethod
    def _sort_chapter_ids(chapter_ids):
        """Return the unique, non-empty ids sorted by numeric value."""
        # Sorting the raw strings would place "10" before "9"; the string is
        # kept as a tie-breaker so the order is deterministic.
        unique_ids = {cid for cid in chapter_ids if cid}
        return sorted(unique_ids, key=lambda cid: (int(cid), cid))

    def do_request(self, url):
        """Fetch ``url`` with a browser User-Agent and return the decoded page.

        Args:
            url: Absolute URL to download.

        Returns:
            The response body decoded as GBK text.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
        }
        req = request.Request(url, headers=headers)
        # The context manager closes the connection even if decoding fails.
        with request.urlopen(req) as response:
            return response.read().decode(encoding="gbk")

    def get_chapter_list(self):
        """Return the chapter ids found on the book index page.

        Returns:
            A list of unique id strings in ascending numeric order.
        """
        index_url = home_url + "book/" + str(self.book_id) + "/"
        index_html = self.do_request(index_url)
        return self._sort_chapter_ids(chapter_pattern.findall(index_html))

    def deal_content(self, content):
        """Convert an HTML fragment to plain text with ``html2text``."""
        return html2text.html2text(content)

    def _collect_content(self, page_html):
        """Return the body of ``page_html`` joined with its continuation pages.

        Continuation pages are followed in a loop (instead of recursively)
        for as long as the current page body carries the continue marker.

        Raises:
            ValueError: If a page has no content block, or announces a
                continuation without providing a "next page" link.
        """
        parts = []
        while True:
            found = content_pattern.search(page_html)
            if found is None:
                raise ValueError("chapter content block not found in page")
            body = tag_pattern.sub("", found.group(1))
            if self._CONTINUE_MARKER not in body:
                parts.append(body)
                break
            parts.append(body.replace(self._CONTINUE_MARKER, ""))
            next_link = next_page_pattern.search(page_html)
            if next_link is None:
                raise ValueError(
                    "page announces a continuation but has no next-page link"
                )
            page_html = self.do_request(self._page_url(next_link.group(1)))
        return "".join(parts)

    def get_page_content(self, page_id):
        """Return the raw HTML body of page ``page_id`` and the pages after it.

        Args:
            page_id: Page identifier as it appears in the ``<id>.html`` URL.

        Returns:
            The concatenated HTML bodies with the site banner and the
            continue marker removed.

        Raises:
            ValueError: See ``_collect_content``.
        """
        return self._collect_content(self.do_request(self._page_url(page_id)))

    def get_chapter_content(self, content_id):
        """Download one chapter, print its title and return title and text.

        Args:
            content_id: Chapter identifier as returned by ``get_chapter_list``.

        Returns:
            A ``(title, text)`` tuple; ``text`` is the chapter body with all
            whitespace removed and then converted by ``deal_content``.

        Raises:
            ValueError: If the title or the content cannot be located.
        """
        chapter_html = self.do_request(self._page_url(content_id))
        title_match = title_pattern.search(chapter_html)
        if title_match is None:
            raise ValueError("chapter title not found in page")
        title = title_match.group(1)
        print(title)
        content = self._collect_content(chapter_html)
        # Whitespace is stripped from the raw HTML before the text conversion.
        content = empty_pattern.sub("", content)
        return (title, self.deal_content(content))

    def get_txt(self, path='books.txt'):
        """Append every chapter of the book to a UTF-8 text file.

        Args:
            path: Output file; defaults to ``books.txt`` in the working
                directory.
        """
        chapter_list = self.get_chapter_list()
        if not chapter_list:
            return
        # Open the file once instead of once per chapter.
        with open(path, 'a', encoding='utf-8') as f:
            for item in chapter_list:
                # Use this instance rather than a module-level global.
                title, content = self.get_chapter_content(item)
                f.write(title + "\n\n")
                f.write(content)
if __name__ == "__main__":
    # Script entry point: download every chapter of book 20051.
    spider = Spider(book_id=20051)
    spider.get_txt()