MainInformation.py
函数功能:获取租房页面的信息
import requests
from bs4 import BeautifulSoup
import time
def _select_last(soup, selector):
    """Return the last element matching *selector*, or None if nothing matches."""
    matches = soup.select(selector)
    return matches[-1] if matches else None


def getMainInformation(url, timeout=10):
    """Scrape the main fields of one rental listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the HTTP response before requests
            raises a timeout error.

    Returns:
        A dict with the keys 'title', 'price', 'sex', 'name', 'photo',
        'add' and 'ownerPhoto'. A key keeps the placeholder value 'hello'
        when its selector matches nothing on the page.
    """
    # Without a timeout an unresponsive server would block the crawl forever.
    html = requests.get(url, timeout=timeout)
    bsHtml = BeautifulSoup(html.text, 'lxml')
    data = {
        'title': 'hello',
        'price': 'hello',
        'sex': 'hello',
        'name': 'hello',
        'photo': 'hello',
        'add': 'hello',
        'ownerPhoto': 'hello',
    }

    # Listing title.
    title = _select_last(
        bsHtml,
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    if title is not None:
        data['title'] = title.get_text()

    # Address (surrounding whitespace removed).
    address = _select_last(
        bsHtml,
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
    if address is not None:
        data['add'] = address.get_text().strip()

    # Price, kept as the text shown on the page.
    price = _select_last(bsHtml, '#pricePart > div.day_l > span')
    if price is not None:
        data['price'] = price.get_text()

    # Landlord avatar URL.
    avatar = _select_last(
        bsHtml,
        '#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    if avatar is not None:
        data['ownerPhoto'] = avatar.get('src')

    # Landlord gender: a first CSS class of 'member_ico' means male, anything
    # else means female. The last badge that carries a class wins.
    for badge in bsHtml.select('div[class="member_pic"] > div'):
        classes = badge.get('class')
        if not classes:
            # A div without a class attribute tells us nothing; indexing it
            # would raise, so skip it.
            continue
        data['sex'] = 'male' if classes[0] == 'member_ico' else 'female'

    # Landlord name.
    owner = _select_last(
        bsHtml,
        '#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    if owner is not None:
        data['name'] = owner.get_text()

    # Room photo: only the first 800px-wide image is kept.
    photos = bsHtml.select(
        '#detailImageBox > div.pho_show_r > div > ul > li > img[data-width="800"]')
    if photos:
        data['photo'] = photos[0].get('data-src')

    # Pause between requests so the site is not hit too quickly.
    time.sleep(1)
    return data
# Smoke test. The call is guarded so that importing this module (as Main.py
# does) no longer triggers a network request and a one-second sleep.
url = 'http://bj.xiaozhu.com/fangzi/1466098635.html'
if __name__ == '__main__':
    getMainInformation(url)
getPageHref.py
函数功能:获取租房链接
import requests
from bs4 import BeautifulSoup
import time
def getPageHref(url, timeout=10):
    """Collect the listing links found on one search-result page.

    Args:
        url: Address of the search-result page to download.
        timeout: Seconds to wait for the HTTP response before requests
            raises a timeout error.

    Returns:
        A list of href strings, in page order, taken from the anchors whose
        class attribute is exactly "resule_img_a". Anchors with a missing or
        empty href are left out.
    """
    # Without a timeout an unresponsive server would block the crawl forever.
    html = requests.get(url, timeout=timeout)
    bsHtml = BeautifulSoup(html.text, 'lxml')

    anchors = bsHtml.select('#page_list > ul > li > a[class="resule_img_a"]')
    # Drop anchors without a usable href so callers never receive None.
    self_hrefs = [a.get('href') for a in anchors if a.get('href')]

    # Pause between requests so the site is not hit too quickly.
    time.sleep(1)
    return self_hrefs
# Smoke test. The call is guarded so that importing this module (as Main.py
# does) no longer triggers a network request and a one-second sleep.
url = 'http://bj.xiaozhu.com/search-duanzufang-p1-0/'
if __name__ == '__main__':
    getPageHref(url)
Main.py
函数功能:筛选出房价不低于400(大于或等于400)的租房信息
import getPageHref
import MainInformation
import pymongo
def _parse_price(raw):
    """Convert a stored price value to float, or return None if it is not numeric."""
    try:
        return float(raw)
    except (TypeError, ValueError):
        return None


def _crawl_into(collection, page_urls):
    """Scrape every listing linked from *page_urls* and insert it into *collection*."""
    for page_url in page_urls:
        for listing_url in getPageHref.getPageHref(page_url):
            collection.insert_one(MainInformation.getMainInformation(listing_url))


def main(min_price=400, crawl=False):
    """Print the stored listings whose price is at least *min_price*.

    Args:
        min_price: Lowest price (inclusive) a listing needs to be printed.
        crawl: When true, first scrape search-result pages 1 and 2 and insert
            every listing into the collection. Off by default, so a plain
            call only reads what is already stored.

    Connects to MongoDB on localhost:27017 and uses the 'sheet_tab'
    collection of the 'houseMess' database.
    """
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i)
            for i in range(1, 3)]
    client = pymongo.MongoClient('localhost', 27017)
    try:
        sheet_tab = client['houseMess']['sheet_tab']
        if crawl:
            _crawl_into(sheet_tab, urls)

        # Keep the listings priced at or above the threshold.
        # NOTE: prices are text scraped from the web. They used to go through
        # eval(), which executes arbitrary expressions and raises NameError on
        # the 'hello' placeholder stored when no price was found. They are
        # now parsed as plain numbers and non-numeric records are skipped.
        house = []
        for record in sheet_tab.find():
            price = _parse_price(record.get('price'))
            if price is not None and price >= min_price:
                house.append(record)

        for record in house:
            print(record)
    finally:
        # Always release the connection, even if the query fails.
        client.close()


if __name__ == '__main__':
    main()