# getHref.py
# Purpose: collect the links of all rental listings.
from getMainPageInformation import *
from bs4 import BeautifulSoup
import requests
# Landing page of the Beijing site, crawled first.
url1 = 'http://bj.xiaozhu.com/'
# Paginated search result pages 2 through 19.
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(2, 20)]
# Not used by the visible code; kept so the module still exposes the name.
sourceData = []
# Holds whatever MainPageInformation returns for the landing page.
hreflist = MainPageInformation(url1)
# Crawl every remaining result page; the call must be the loop body.
for page_url in urls:
    MainPageInformation(page_url)
# getMainPageInformation.py
# Purpose: extract the rental listing links from a result page.
from bs4 import BeautifulSoup
import requests
import pymongo
# Handle on the MongoDB server at localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Database that holds the crawler's data.
Xiaozhu = client.get_database('Xiaozhu')
# Collection that MainPageInformation writes rental links into.
Href = Xiaozhu.get_collection('href')
def MainPageInformation(url, timeout=10):
    """Download one result page and store every rental link found on it.

    Args:
        url: Address of the result page to download.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests.get`` may block indefinitely on a stalled server.

    Returns:
        The list of link strings inserted into the ``href`` collection,
        in the order they appear on the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    # Selector string (including the "resule_img_a" spelling) is kept
    # exactly as the crawler has always used it.
    anchors = soup.select(' ul > li > a[class="resule_img_a"]')

    stored_links = []
    for anchor in anchors:
        link = anchor.get('href')
        if not link:
            # An anchor without a target would be stored as None and later
            # passed on as a URL to fetch, so skip it here.
            continue
        Href.insert_one({'href': link})
        stored_links.append(link)
    return stored_links
# getPageInformation.py
# Purpose: fetch the details of each rental listing (can resume after an interruption).
from bs4 import BeautifulSoup
import requests
import pymongo
# Handle on the MongoDB server at localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Database that holds the crawler's data.
Xiaozhu = client.get_database('Xiaozhu')
# Collection that getPageInformation writes listing details into.
Information = Xiaozhu.get_collection('Information')
# Collection of rental links read by getInformation.
Href = Xiaozhu.get_collection('href')
def getPageInformation(url, timeout=10):
    """Download one listing page and store its details in ``Information``.

    Args:
        url: Address of the listing page; also saved in the record under
            ``'href'`` so finished pages can be recognised later.
        timeout: Seconds to wait for the HTTP response; without a timeout
            ``requests.get`` may block indefinitely on a stalled server.

    Returns:
        None. One document is inserted per complete set of matched elements.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    titles = soup.select('div.pho_info > h4 > em')
    room_images = soup.select('#curBigImage')
    prices = soup.select("div.day_l > span")
    addresses = soup.select('div.pho_info > p > span.pr5')
    host_images = soup.select('div.member_pic > a > img')
    host_names = soup.select('div.w_240 > h6 > a')

    # zip stops at the shortest list, so a page on which any selector matches
    # nothing produces no record at all rather than a partial or empty one.
    matched = zip(titles, room_images, prices, addresses, host_images, host_names)
    for title, room_image, price, address, host_image, host_name in matched:
        Information.insert_one({
            'title': title.get_text(),
            'roomImage': room_image.get('src'),
            'price': price.get_text(),
            'address': address.get_text().strip(),
            'hosterImage': host_image.get('src'),
            'hosterName': host_name.get_text(),
            'href': url,
        })
def getInformation():
    """Fetch details for every stored link that has no stored details yet.

    The ``href`` values in the ``href`` collection are compared with those
    already present in ``Information``; ``getPageInformation`` is called only
    for the difference, so a restarted run skips pages that were already
    saved.

    Returns:
        None.
    """
    only_href = {'href': True, '_id': False}
    # Compare plain link strings: the documents themselves are dicts, which
    # cannot be placed in a set, and their differing _id values would keep
    # them from ever matching across the two collections.
    # The projection is passed positionally because its keyword name differs
    # between PyMongo releases (`fields` before 3.0, `projection` after).
    collected = {
        doc['href'] for doc in Href.find({}, only_href) if doc.get('href')
    }
    finished = {
        doc['href'] for doc in Information.find({}, only_href) if doc.get('href')
    }
    # Sorted only to make the crawl order reproducible between runs.
    for link in sorted(collected - finished):
        getPageInformation(link)
# print(titles)
# print("-------------------------------------------")
# print(roomImages)
# print("-------------------------------------------")
# print(price)
# print("-------------------------------------------")
# print(address)
# print("-------------------------------------------")
# print(hosterImage)
# url='http://bj.xiaozhu.com/fangzi/269024901.html'
# Script entry point: start fetching listing details when this file runs.
# There is no __main__ guard, so importing the file triggers the call too.
getInformation()