I've been learning web scraping for about half a month now, and scraping pages like this one doesn't feel all that technically demanding anymore.
Right now I'm stuck on multiprocessing; on top of that, crawling large volumes of data keeps throwing up one bug after another, which is a real headache, and proxies haven't been going smoothly for me either. Keep learning!
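On the proxy front, here is a minimal sketch of how requests accepts a proxies dict, with simple random rotation and retries. The PROXY_POOL addresses and the fetch helper are made up for illustration; they are not part of the crawler below.

import random
import requests

# Hypothetical proxy pool -- replace with proxies you actually have
PROXY_POOL = [
    'http://123.56.74.13:8080',
    'http://101.200.178.90:3128',
]

def fetch(url, retries=3):
    """Fetch url through a randomly chosen proxy, retrying on failure."""
    for _ in range(retries):
        proxy = random.choice(PROXY_POOL)
        try:
            resp = requests.get(url, proxies={'http': proxy}, timeout=10)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            continue  # this proxy failed; try another one
    return None  # all retries failed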
__author__ = 'Kalvin.Tse'

from bs4 import BeautifulSoup
import requests
import pymongo
import time

client = pymongo.MongoClient('localhost', 27017)
baike = client['baike']
joke_info = baike['joke_info']

def get_joke(pages):
    for i in range(1, pages + 1):  # pages 1..pages inclusive
        url = 'http://www.qiushibaike.com/8hr/page/{}'.format(i)
        wb_data = requests.get(url)
        time.sleep(1)  # throttle so we don't hammer the server
        print('Parsing page ' + str(i))
        print('--' * 50)
        if wb_data.status_code == 200:
            analyse = BeautifulSoup(wb_data.text, 'lxml')
            names = analyse.select('div.author.clearfix h2')
            contents = analyse.select('div.content')
            likes = analyse.select('div.stats span.stats-vote i.number')
            for name, content, like in zip(names, contents, likes):
                data = {
                    'username': name.get_text(),
                    'content': content.get_text().strip(),
                    'likes': like.get_text()
                }
                print(data)
                joke_info.insert_one(data)
        else:
            pass  # some page numbers have no content; just skip them

get_joke(50)  # crawl the first 50 pages (pages without content are skipped)
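As for the multiprocessing sticking point, one way to parallelize the crawler above is a multiprocessing.Pool that maps a single-page worker over the page numbers. This is just a sketch under that assumption: the crawl_page helper repeats one loop iteration of get_joke, and each worker opens its own MongoDB connection, since pymongo clients should not be shared across forked processes.

from multiprocessing import Pool

import pymongo
import requests
from bs4 import BeautifulSoup

def crawl_page(i):
    """Fetch and parse a single page; same logic as one iteration of get_joke."""
    # each worker opens its own connection (pymongo clients are not fork-safe)
    collection = pymongo.MongoClient('localhost', 27017)['baike']['joke_info']
    url = 'http://www.qiushibaike.com/8hr/page/{}'.format(i)
    wb_data = requests.get(url)
    if wb_data.status_code != 200:
        return  # skip empty/missing pages, as in get_joke
    analyse = BeautifulSoup(wb_data.text, 'lxml')
    names = analyse.select('div.author.clearfix h2')
    contents = analyse.select('div.content')
    likes = analyse.select('div.stats span.stats-vote i.number')
    for name, content, like in zip(names, contents, likes):
        collection.insert_one({
            'username': name.get_text(),
            'content': content.get_text().strip(),
            'likes': like.get_text()
        })

if __name__ == '__main__':
    with Pool(4) as pool:  # 4 worker processes
        pool.map(crawl_page, range(1, 51))  # pages 1-50 in parallel

Note that pool.map blocks until every page is done, and a pool of 4 workers with no sleep hits the site roughly 4x faster than the sequential loop above, so a smaller pool or a per-worker delay keeps the request rate polite.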