Goal
Crawl the listing prices from multiple search pages of Xiaozhu (xiaozhu.com, a short-term rental site), filter out the rooms priced above 500, and save the results to a MongoDB database.
Method
import requests
from bs4 import BeautifulSoup
import pymongo
client = pymongo.MongoClient('localhost', 27017)  # connect to the local MongoDB server
xiaozhu = client['xiaozhu']  # databases are accessed like dict keys; this one is created lazily on first write
bnb_info = xiaozhu['bnb_info']  # likewise, indexing the database by name yields the bnb_info collection
#==========<< Single-page scraper >>==========#
url = 'http://bj.xiaozhu.com/search-duanzufang-p20-0/'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.text, 'lxml')
titles = soup.select('span.result_title')
prices = soup.select('span.result_price > i')
#------ Everything from here down to the first print('Done') is the single-page prototype used to work out the logic; comment it out once the multi-page crawl below is in place ------#
for title, price in zip(titles, prices):
    data = {
        'title': title.get_text(),
        'price': int(price.get_text())  # cast to int so prices can be compared numerically later
    }
    bnb_info.insert_one(data)  # insert each scraped listing into the bnb_info collection
print('Done')
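As a quick sanity check (my addition, not part of the original walkthrough), pymongo can report how many documents landed and show the shape of one of them:

# Sanity check: confirm the inserts above actually reached MongoDB.
print(bnb_info.count_documents({}))  # number of listings stored so far
print(bnb_info.find_one())           # one sample document, including its auto-generated _id

Note that insert_one appends a fresh document on every run, so re-running the script duplicates listings; an update_one(..., upsert=True) keyed on the title would avoid that if it matters.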
#==========<< Multi-page crawl function >>==========#
def get_page_within(pages):
    for page_num in range(1, pages + 1):
        wb_data = requests.get('http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_num))
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('span.result_title')
        prices = soup.select('span.result_price > i')  # the <i> tag must contain only the number
        for title, price in zip(titles, prices):
            data = {
                'title': title.get_text(),
                'price': int(price.get_text())  # same 'price' key as above, so the filter below works
            }
            bnb_info.insert_one(data)  # bnb_info now accumulates one document per listing
    print('Done')
get_page_within(3)  # crawl the first 3 pages to start
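One practical refinement, offered as a sketch rather than part of the original method: sites often block bare requests calls, so sending a browser-like User-Agent and pausing between pages makes the crawl more robust. The header value and the fetch_page helper name are illustrative assumptions:

import time

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # illustrative browser-like UA; not from the original script

def fetch_page(page_num):  # hypothetical helper, same URL pattern as get_page_within
    url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_num)
    resp = requests.get(url, headers=HEADERS)
    time.sleep(2)  # polite pause so consecutive requests don't hammer the site
    return resp.text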
# Filter the results stored in the database
for i in bnb_info.find():  # a Collection is not directly iterable; find() returns a cursor over all documents
    if i['price'] > 500:
        print(i)
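The loop above filters on the client side after fetching every document. MongoDB can apply the same condition itself with a query operator, which avoids transferring documents that will be discarded; a minimal equivalent using pymongo's $gt operator:

# Server-side equivalent of the loop above: MongoDB does the filtering.
for room in bnb_info.find({'price': {'$gt': 500}}):
    print(room)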
With the steps above, the scraped data can be filtered as required.