抓取糗事百科的笑话哈哈
# Procedural version
import requests
import re
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    A desktop Firefox User-Agent header is sent with the request, and the
    request uses a 30 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, with the encoding forced to UTF-8.

    Raises:
        requests.exceptions.RequestException: Any error raised by
            ``requests.get`` or by ``raise_for_status()`` (non-success HTTP
            status) is propagated to the caller unchanged.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    # No try/except here: the previous ``except: raise`` only re-raised, so
    # letting the exception propagate directly is equivalent.
    r = requests.get(url, timeout=30, headers=headers)
    r.raise_for_status()
    # Force UTF-8 rather than relying on the encoding guessed from the headers.
    r.encoding = 'utf-8'
    return r.text
def parseHTML(ilt,html):
    """Extract joke entries from *html* and append them to *ilt*.

    Every ``<div>`` selected by the class string
    ``'article block untagged mb15'`` is treated as one joke. For each one
    the function collects the text of its ``<h2>``, the text of its first
    ``<span>``, the text of the ``span.stats-vote`` element and the text of
    the ``a.qiushi_comments`` element.

    Args:
        ilt: List that receives one ``[title, article, joy, comment]`` list
            per successfully parsed joke. It is mutated in place.
        html: HTML source of a listing page.

    Returns:
        None. Results are delivered through *ilt*.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', attrs={'class': 'article block untagged mb15'}):
        try:
            title = item.h2.string
            article = item.span.get_text()
            joy = item.find('span', attrs={'class': 'stats-vote'}).get_text()
            comment = item.find('a', attrs={'class': 'qiushi_comments'}).get_text()
        except AttributeError:
            # A missing tag yields None, and the attribute access on None
            # raises AttributeError: skip entries that lack an expected part.
            continue
        ilt.append([title, article, joy, comment])
def printArticle(ilt):
    """Print every joke and append the joke bodies to ``joke.txt``.

    Each entry is printed as title, article and then vote/comment text. Only
    the article text (index 1) is written to the file, followed by a blank
    line.

    Args:
        ilt: Sequence of ``[title, article, joy, comment]`` entries.

    Returns:
        None.
    """
    if not ilt:
        # Nothing to output; also avoids creating an empty joke.txt.
        return
    pattern = '{0}\n{1}\n{2}{3}\n\n'
    # Open the output file once instead of re-opening it for every entry.
    with open('joke.txt', 'a', encoding='utf-8') as f:
        for every in ilt:
            print(pattern.format(every[0], every[1], every[2], every[3]))
            f.write(every[1] + '\n' * 2)
def main():
    """Scrape page 1 of the "hot" listing, then print and save its jokes.

    The page URL is built from the listing base address plus the page
    number; the downloaded HTML is parsed into a fresh list which is then
    handed to ``printArticle``.
    """
    page = 1
    base = 'http://www.qiushibaike.com/hot/page/'
    page_source = getHTMLText(base + str(page))
    jokes = []
    parseHTML(jokes, page_source)
    printArticle(jokes)
# Entry point of the procedural version. The call sits at module level with
# no __main__ guard, so it also runs when the file is imported.
main()
# Class-based version (implemented with a class and an instance)
import re
import requests
from bs4 import BeautifulSoup
class QSBK(object):
    """Scraper for one page of the qiushibaike.com "hot" joke listing.

    Typical use: create an instance, call ``parsePage()`` to download and
    parse the page into ``self.ilt``, then call ``printJoke()`` to print the
    jokes and append their text to ``joke.txt``.
    """

    def __init__(self, pageNumber=2):
        """Set up the target URL, request headers and the result list.

        Args:
            pageNumber: Page of the listing to scrape. Defaults to 2.
        """
        self.pageNumber = pageNumber
        self.start_url = 'http://www.qiushibaike.com/hot/page/'
        self.url = self.start_url + str(self.pageNumber)
        # Parsed jokes, each stored as [title, article, joy, comment].
        self.ilt = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}

    def getHTMLText(self):
        """Download ``self.url`` and return its text decoded as UTF-8.

        Returns:
            The page text, or the string ``'ERROR'`` when the request fails
            or the server answers with an error status.
        """
        try:
            r = requests.get(self.url, timeout=20, headers=self.headers)
            r.raise_for_status()
        except requests.RequestException:
            # Keep the sentinel return value, but no longer swallow
            # unrelated exceptions such as KeyboardInterrupt.
            return 'ERROR'
        r.encoding = 'utf-8'
        return r.text

    def parsePage(self):
        """Fetch the page and append every parsed joke to ``self.ilt``.

        Entries that lack one of the expected tags are skipped. When the
        download fails nothing is appended.
        """
        html = self.getHTMLText()
        if html == 'ERROR':
            # Failed download: the sentinel text contains no joke blocks.
            return
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', attrs={'class': 'article block untagged mb15'}):
            try:
                title = item.h2.string
                article = item.span.get_text()
                joy = item.find('span', attrs={'class': 'stats-vote'}).get_text()
                comment = item.find('a', attrs={'class': 'qiushi_comments'}).get_text()
            except AttributeError:
                # A missing tag yields None; skip incomplete entries.
                continue
            self.ilt.append([title, article, joy, comment])

    def printJoke(self):
        """Print every stored joke and append the joke text to ``joke.txt``.

        Only the article text (index 1 of each entry) is written to the
        file, followed by a blank line.
        """
        if not self.ilt:
            # Nothing to output; also avoids creating an empty joke.txt.
            return
        pattern = '{0}\n{1}\n{2}{3}\n\n'
        # Open the output file once instead of once per joke.
        with open('joke.txt', 'a', encoding='utf-8') as f:
            for every in self.ilt:
                print(pattern.format(every[0], every[1], every[2], every[3]))
                f.write(every[1] + '\n' * 2)
# Run the class-based scraper. parsePage() downloads the page itself, so a
# separate getHTMLText() call here would only fetch the same page twice.
spider = QSBK()
spider.parsePage()
spider.printJoke()
python 爬取糗事百科
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 利用python写一个爬虫,爬取百度百科的某一个词条下面的全部链接和每一个链接内部的词条主题和摘要。利用reque...