A Qiushibaike ("糗百") crawler in Python 2.7

Posting the code first; the write-up, the comments, the bug fixes and the references will all come later when I have time. That's it for now (Ge You slouch). In short: the spider downloads listing pages from qiushibaike.com, pulls out each text-only article (author, body, and the top "God Comment"), and main.py prints random ones until you quit.

File 1: main.py

# -*- coding:utf-8 -*-
import qsbk

# Build the spider, pick the '8hr' section and pre-load 10 pages.
spider = qsbk.QsbkSpider()
spider.section = '8hr'
spider.loadSomePages(10)

# Print random articles until the user types q/Q (or nothing was loaded).
while True:
    article = spider.getRandomArticle()
    if not article:
        break
    print '[ page', article['pageIndex'], 'article', article['articleIndex'], ']\n', \
        '< Article by', article['author'], '>\n', article['text'], '\n< God Comment >\n', \
        article['cmtMan'], article['cmt']
    print 'press enter for the next article, q to quit'
    key = raw_input()
    if key in ['q', 'Q']:
        break

File 2: qsbk.py

# -*- coding:utf-8 -*-
__author__ = 'ssins'
import urllib2
import re
import random
from bs4 import BeautifulSoup

class QsbkSpider:
    def __init__(self):
        self._pageIndex = 1        # next page number to load
        self.maxPageIndex = 35     # never go past page 35
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self._qsbkUrl = 'http://www.qiushibaike.com/'
        self.section = '8hr'       # section to crawl, must be one of _sections
        self._sections = ['8hr', 'hot', 'imgrank', 'text', 'history', 'pic', 'textnew']

        self._headers = {'User-Agent': self.user_agent}
        self._stories = []         # one list of article dicts per loaded page
        self.enable = False

    def getPageUrl(self, section, pageIndex):
        # Build the URL of one listing page, e.g. http://www.qiushibaike.com/8hr/page/1
        if section not in self._sections or pageIndex < 1 or pageIndex > self.maxPageIndex:
            return None
        return self._qsbkUrl + section + '/page/' + str(pageIndex)

    def getPageInfo(self, url):
        # Fetch the raw HTML of a page; return None if the URL is missing
        # or the request fails for any reason.
        if not url:
            return None
        try:
            request = urllib2.Request(url, headers=self._headers)
            response = urllib2.urlopen(request)
            return response.read()
        except Exception:
            return None

    def find_article_span(self, tag):
        # Filter for BeautifulSoup.find(): match the <span> that holds the
        # article text, skipping spans that contain an <img> or <h2>.
        if tag.name != 'span':
            return False
        for child in tag.children:
            if child.name in ['img', 'h2']:
                return False
        return True

    def getPageArticles(self, section, pageIndex):
        # Download one listing page, parse every text-only article on it and
        # append the resulting list of dicts to self._stories.
        pageCode = self.getPageInfo(self.getPageUrl(section, pageIndex))
        if not pageCode:
            return None
        soup = BeautifulSoup(str(pageCode), 'lxml')
        #soup = BeautifulSoup(str(pageCode), 'html.parser')
        articles = soup.find_all('div', class_='article block untagged mb15')
        articlesDictionaryList = []
        try:
            for tmpArt in articles:
                article = str(tmpArt)
                # Skip articles that carry a picture thumbnail.
                if re.search('class="thumb"', article):
                    continue
                # Turn <br/> into real line breaks before re-parsing the fragment.
                article = re.sub('<br/>', '\n', article)
                soupArticle = BeautifulSoup(article, 'lxml')
                #soupArticle = BeautifulSoup(article, 'html.parser')
                author = soupArticle.h2.string
                text = soupArticle.find(self.find_article_span).string
                cmtMan = 'no God Comment'
                cmt = ''
                try:
                    # The "God Comment" block is optional.
                    cmtMan = soupArticle.find('span', class_='cmt-name').string
                    cmt = soupArticle.find('div', class_='main-text').string
                except AttributeError:
                    pass
                articlesDictionaryList.append({
                    'author': author,
                    'text': text,
                    'cmtMan': cmtMan,
                    'cmt': cmt,
                })
            self._stories.append(articlesDictionaryList)
        except Exception:
            # Any unexpected page layout aborts this page.
            return False
        return True

    def loadNextPage(self):
        # Load the next page of the current section; return True on success.
        if self._pageIndex > self.maxPageIndex:
            return False
        if self.getPageArticles(self.section, self._pageIndex):
            self._pageIndex += 1
            return True
        return False

    def loadSomePages(self, pageNums):
        # Try to load pageNums consecutive pages.
        for i in range(pageNums):
            self.loadNextPage()

    def getRandomArticle(self):
        # Pick a random article from the loaded pages; return None if nothing is loaded.
        if len(self._stories) < 1:
            return None
        pageIndex = random.randint(0, len(self._stories) - 1)
        articleIndex = random.randint(0, len(self._stories[pageIndex]) - 1)
        article = self._stories[pageIndex][articleIndex]
        article['pageIndex'] = pageIndex + 1       # 1-based indices for display
        article['articleIndex'] = articleIndex + 1
        return article
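
The parsing in qsbk.py leans on a specific page layout: each article sits in a div with class "article block untagged mb15", the author in an h2, the body text in a plain span, and the optional "God Comment" in span.cmt-name / div.main-text. Below is a minimal offline sketch that exercises those helpers against a hand-written snippet; the HTML is hypothetical and only mirrors the class names the spider looks for, so the live site may well differ.

# -*- coding:utf-8 -*-
# Offline sanity check for the parsing assumptions in qsbk.py.
# The sample HTML below is hand-written and only mirrors the class names
# the spider looks for; the real qiushibaike.com markup may differ.
from bs4 import BeautifulSoup
import qsbk

sample = '''
<div class="article block untagged mb15">
  <h2>some_author</h2>
  <span>placeholder joke text</span>
  <span class="cmt-name">some_commenter:</span>
  <div class="main-text">placeholder god comment</div>
</div>
'''

spider = qsbk.QsbkSpider()
soup = BeautifulSoup(sample, 'lxml')

print soup.h2.string                                # author
print soup.find(spider.find_article_span).string    # article text
print soup.find('span', class_='cmt-name').string   # God Comment author
print soup.find('div', class_='main-text').string   # God Comment text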