先贴代码,文章以后有空再写,注释也以后再写,bug也以后再调,参考文献也以后再贴吧,就这样了(葛优躺)
文件1:main.py
# -*- coding:utf-8 -*-
import urllib
import qsbk
def main():
    """Show random articles from the '8hr' section until the user quits.

    Asks the spider for up to ten page loads, then repeatedly prints one
    randomly chosen article and waits for a line of input. Entering ``q``
    or ``Q`` (or closing stdin / pressing Ctrl-C) ends the loop; the loop
    also ends as soon as the spider has no article to return.
    """
    spider = qsbk.QsbkSpider()
    spider.section = '8hr'
    spider.loadSomePages(10)
    while True:
        article = spider.getRandomArticle()
        if not article:
            # Nothing was loaded (or nothing usable was parsed): stop quietly.
            break
        # A single pre-formatted argument keeps this print valid as both a
        # Python 2 print statement and a Python 3 print() call.
        print('[ page %s artical %s ]\n< Article by %s >\n%s\n< God Comment >\n%s %s' % (
            article['pageIndex'],
            article['articleIndex'],
            article['author'],
            article['text'],
            article['cmtMan'],
            article['cmt'],
        ))
        print('press enter to get next article, or q to quit')
        try:
            # Named `answer` so the builtin `input` is not shadowed.
            answer = raw_input()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C is treated as a request to quit.
            break
        if answer.strip() in ('q', 'Q'):
            break


if __name__ == '__main__':
    main()
文件2:qsbk.py
__author__ = 'ssins'
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import random
from bs4 import BeautifulSoup
class QsbkSpider(object):
    """Scraper for article listing pages of qiushibaike.com.

    Pages are fetched one at a time (``loadNextPage``) and the articles
    parsed from each page are stored in ``self._stories`` as one list per
    page. Every article is a dict with the keys ``author``, ``text``,
    ``cmtMan`` and ``cmt``; ``getRandomArticle`` additionally stamps
    ``pageIndex`` and ``articleIndex`` on the dict it returns.
    """

    def __init__(self):
        self._pageIndex = 1          # number of the next page to fetch
        self.maxPageIndex = 35       # highest page number getPageUrl accepts
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self._qsbkUrl = 'http://www.qiushibaike.com/'
        self.section = '8hr'         # section used by loadNextPage
        self._sections = ['8hr', 'hot', 'imgrank', 'text', 'history', 'pic', 'textnew']
        self._headers = {'User-Agent': self.user_agent}
        self._stories = []           # one list of article dicts per stored page
        self.enable = False          # not read anywhere in this class

    def getPageUrl(self, section, pageIndex):
        """Return the listing URL for *section* / *pageIndex*.

        Returns None when *section* is not one of the known sections or
        *pageIndex* is outside ``1..self.maxPageIndex``.
        """
        if section not in self._sections:
            return None
        if pageIndex < 1 or pageIndex > self.maxPageIndex:
            return None
        return self._qsbkUrl + section + '/page/' + str(pageIndex)

    def getPageInfo(self, url):
        """Fetch *url* and return the raw response body, or None on failure.

        A falsy *url* (e.g. the None returned by ``getPageUrl`` for invalid
        arguments) yields None without any network access.
        """
        if not url:
            return None
        try:
            request = urllib2.Request(url, headers=self._headers)
            response = urllib2.urlopen(request)
        except Exception:
            # Best-effort fetch: any failure is reported as "no page".
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit
            # propagate.
            return None
        try:
            return response.read()
        except Exception:
            return None
        finally:
            # Always release the connection, even if read() failed.
            response.close()

    def find_article_span(self, tag):
        """Tag filter: True for a <span> with no direct <img> or <h2> child."""
        if tag.name != 'span':
            return False
        # getattr: children may include text nodes, which are not tags.
        for child in tag.children:
            if getattr(child, 'name', None) in ('img', 'h2'):
                return False
        return True

    def _tagText(self, tag):
        """Return the text of *tag*; fall back to get_text() when .string is None."""
        text = tag.string
        if text is None:
            # .string is None when the tag holds more than one child node.
            text = tag.get_text()
        return text

    def _parseArticle(self, articleTag):
        """Build the article dict for one article <div>, or None to skip it.

        Articles whose markup contains ``class="thumb"`` are skipped, as are
        articles where the author <h2> or the text <span> cannot be found.
        """
        html = str(articleTag)
        if 'class="thumb"' in html:
            return None
        # Turn line breaks into newlines so the text span holds plain text.
        html = html.replace('<br/>', '\n')
        soupArticle = BeautifulSoup(html, 'lxml')
        #soupArticle = BeautifulSoup(html, 'html.parser')
        authorTag = soupArticle.h2
        textTag = soupArticle.find(self.find_article_span)
        if authorTag is None or textTag is None:
            return None

        cmtMan = 'no God Comment'
        cmt = ''
        cmtManTag = soupArticle.find('span', class_='cmt-name')
        if cmtManTag is not None:
            cmtMan = self._tagText(cmtManTag)
            cmtTag = soupArticle.find('div', class_='main-text')
            if cmtTag is not None:
                cmt = self._tagText(cmtTag)

        return {
            'author': self._tagText(authorTag),
            'text': self._tagText(textTag),
            'cmtMan': cmtMan,
            'cmt': cmt,
        }

    def getPageArticles(self, section, pageIndex):
        """Fetch one listing page, parse it and store its articles.

        The parsed articles are appended to ``self._stories`` as a single
        list (possibly empty). Returns None when the page could not be
        fetched, True once the page has been stored.
        """
        pageCode = self.getPageInfo(self.getPageUrl(section, pageIndex))
        if not pageCode:
            return None
        soup = BeautifulSoup(pageCode, 'lxml')
        #soup = BeautifulSoup(pageCode, 'html.parser')
        articleTags = soup.find_all('div', class_='article block untagged mb15')
        pageArticles = []
        for articleTag in articleTags:
            try:
                parsed = self._parseArticle(articleTag)
            except Exception:
                # Best-effort: one unparsable article must not discard the
                # rest of the page.
                continue
            if parsed is not None:
                pageArticles.append(parsed)
        self._stories.append(pageArticles)
        return True

    def loadNextPage(self):
        """Load the next page of ``self.section``; return True on success.

        The page counter only advances when the page was stored, and never
        beyond ``self.maxPageIndex``.
        """
        if self._pageIndex > self.maxPageIndex:
            return False
        if not self.getPageArticles(self.section, self._pageIndex):
            return False
        self._pageIndex += 1
        return True

    def loadSomePages(self, pageNums):
        """Call ``loadNextPage`` *pageNums* times."""
        for _ in range(pageNums):
            self.loadNextPage()

    def getRandomArticle(self):
        """Return a random stored article, or None when there is none.

        A page is picked uniformly among the stored pages that contain at
        least one article, then an article uniformly within it. The returned
        dict is the stored one, updated in place with 1-based ``pageIndex``
        (position of the page in the store) and ``articleIndex``.
        """
        # Pages without articles are skipped so that an empty page cannot
        # cause a random pick from an empty range.
        candidates = [i for i, page in enumerate(self._stories) if page]
        if not candidates:
            return None
        pageIndex = random.choice(candidates)
        page = self._stories[pageIndex]
        articleIndex = random.randrange(len(page))
        article = page[articleIndex]
        article['pageIndex'] = pageIndex + 1
        article['articleIndex'] = articleIndex + 1
        return article