Python 爬取小说

学习爬虫时写的一个练习项目，运行环境为 Python 3.6。
from urllib import request
import re
from html.parser import HTMLParser
import html2text

# Root URL of the site; book and chapter URLs are built by appending to it.
home_url = "https://www.77xsw.la/"

# NOTE: literal dots are escaped (``\.``) in the patterns below so they only
# match a real "." and not an arbitrary character.

# Captures the text of the <li class="active"> element (used as chapter title).
title_pattern = re.compile(r'<li class="active">(.*?)</li>')
# Captures the numeric chapter id from links of the form <a href="123.html" title="...">.
# At least one digit is required so an empty id can never be captured.
chapter_pattern = re.compile(r'<a href=[\'\"]([0-9]+)\.html[\'\"] title=".*?">')
# Captures the chapter body; re.S lets "." span the line breaks inside the <div>.
content_pattern = re.compile(r'<div class="panel-body" id="htmlContent">(.*?)</div>', re.S)
# Captures the "<chapter>_<page>" id from the "next page" button.
next_page_pattern = re.compile(r'<a id="linkNext" class="btn btn-default" href="([0-9]+_[0-9]+)\.html">下一页</a>')
# Matches the site's promotional banner so it can be stripped from the body.
tag_pattern = re.compile(r'千千小说网 www\.77xsw\.la，最快更新<a href="https://www\.77xsw\.la/book/[0-9]*?/">.*?</a>最新章节！<br><br>')
# Matches any single whitespace character.
empty_pattern = re.compile(r'\s')

class Spider:
    """Download every chapter of one book and append it to a text file.

    All URLs are built as ``home_url + "book/<book_id>/..."``; the HTML is
    parsed with the module-level compiled patterns.
    """

    # Marker found in a page body when the chapter continues on another page.
    _CONTINUED_MARK = '-->><p class="text-danger text-center mg0">本章未完，点击下一页继续阅读</p>'

    def __init__(self, book_id):
        """Store the id of the book to scrape.

        Args:
            book_id: Identifier used in the book's URL path (converted with
                ``str`` when URLs are built).
        """
        self.book_id = book_id

    def _book_url(self):
        """Return the URL of the book's chapter-index page."""
        return home_url + "book/" + str(self.book_id) + "/"

    def _page_url(self, page_id):
        """Return the URL of the chapter/page whose id is *page_id*."""
        return self._book_url() + page_id + ".html"

    def do_request(self, url):
        """Fetch *url* with a browser User-Agent and return the body as text.

        Args:
            url: Absolute URL to request.

        Returns:
            The response body decoded as GBK.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid GBK.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
        }
        req = request.Request(url, headers=headers)
        # Close the connection as soon as the body is read instead of
        # leaving it open until garbage collection.
        with request.urlopen(req) as response:
            return response.read().decode(encoding="gbk")

    def get_chapter_list(self):
        """Return the book's unique chapter ids in ascending numeric order.

        Returns:
            A list of chapter-id strings (digits only).
        """
        index_html = self.do_request(self._book_url())
        # De-duplicate and drop empty matches so int() below cannot fail.
        chapter_ids = {cid for cid in chapter_pattern.findall(index_html) if cid}
        # Sort numerically: plain string sorting would put "10" before "9".
        return sorted(chapter_ids, key=int)

    def deal_content(self, content):
        """Convert an HTML fragment to plain text using html2text."""
        return html2text.html2text(content)

    def _extract_content(self, page_html):
        """Return the body HTML of *page_html* plus all continuation pages.

        The site banner is stripped. When the "chapter continues" marker is
        present it is removed and the next page's content is appended.

        Raises:
            AttributeError: If the content block (or, for a continued page,
                the "next page" link) is not found in *page_html*.
        """
        content = content_pattern.search(page_html).group(1)
        content = tag_pattern.sub("", content)
        if self._CONTINUED_MARK in content:
            content = content.replace(self._CONTINUED_MARK, "")
            next_page_id = next_page_pattern.search(page_html).group(1)
            content += self.get_page_content(next_page_id)
        return content

    def get_page_content(self, page_id):
        """Return the body HTML of page *page_id* and every page after it.

        Args:
            page_id: Page id as it appears in the URL, without ``.html``.

        Returns:
            The concatenated body HTML with banner and continuation markers
            removed.
        """
        return self._extract_content(self.do_request(self._page_url(page_id)))

    def get_chapter_content(self, content_id):
        """Download one chapter and return its title and plain-text body.

        The title is also printed, which serves as progress output.

        Args:
            content_id: Chapter id as it appears in the URL, without ``.html``.

        Returns:
            A ``(title, text)`` tuple.

        Raises:
            AttributeError: If the title or content block is not found.
        """
        chapter_html = self.do_request(self._page_url(content_id))
        title = title_pattern.search(chapter_html).group(1)
        print(title)
        content = self._extract_content(chapter_html)
        # Remove all whitespace from the HTML before converting it to text.
        content = empty_pattern.sub("", content)
        return (title, self.deal_content(content))

    def get_txt(self, filename='books.txt'):
        """Append every chapter of the book to *filename* (UTF-8).

        Args:
            filename: Path of the output file; defaults to ``books.txt``.
        """
        for chapter_id in self.get_chapter_list():
            # Use this instance rather than a module-level global, so the
            # class also works when imported or bound to another name.
            title, content = self.get_chapter_content(chapter_id)
            # Re-open per chapter so an interrupted run keeps what was
            # already downloaded.
            with open(filename, 'a', encoding='utf-8') as f:
                f.write(title + "\n\n")
                f.write(content)

if __name__ == "__main__":
    # Script entry point: scrape the book whose id is 20051 on the site.
    spider = Spider(20051)
    spider.get_txt()
最后编辑于：2020.09.11 17:35:15