Week2_Practice3

getHref.py

Purpose: collect all rental listing links.


from getMainPageInformation import *


url1 = 'http://bj.xiaozhu.com/'
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(2, 20)]

# MainPageInformation() writes each listing link straight into MongoDB,
# so there is no return value to collect here.
MainPageInformation(url1)
for url in urls:
    MainPageInformation(url)


getMainPageInformation.py

Purpose: collect the rental listing links from a single list page.

from bs4 import BeautifulSoup
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
Xiaozhu = client['Xiaozhu']
Href = Xiaozhu['href']


def MainPageInformation(url):
    pageData = requests.get(url)
    soup = BeautifulSoup(pageData.text, 'lxml')
    # Every listing on a search page is an <a class="resule_img_a"> element.
    links = soup.select('ul > li > a[class="resule_img_a"]')

    for link in links:
        item = {
            'href': link.get('href'),
        }
        Href.insert_one(item)
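
A quick sanity check after running getHref.py, to confirm the links actually landed in MongoDB. This is a minimal sketch assuming the same local MongoDB instance and the Xiaozhu/href collection above, and pymongo 3.7+ for count_documents():

import pymongo

client = pymongo.MongoClient('localhost', 27017)
Href = client['Xiaozhu']['href']

# Number of listing links collected so far, and what one stored record looks like.
print(Href.count_documents({}))
print(Href.find_one())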



getPageInformation.py

Purpose: scrape the details of each listing (resumable after an interruption).

from bs4 import BeautifulSoup
import requests
import pymongo

client = pymongo.MongoClient('localhost', 27017)
Xiaozhu = client['Xiaozhu']
Information = Xiaozhu['Information']
Href = Xiaozhu['href']


def getPageInformation(url):
    self_pageData = requests.get(url)
    self_data = BeautifulSoup(self_pageData.text, 'lxml')
    titles = self_data.select('div.pho_info > h4 > em')
    roomImages = self_data.select('#curBigImage')
    prices = self_data.select('div.day_l > span')
    addresses = self_data.select('div.pho_info > p > span.pr5')
    hosterImages = self_data.select('div.member_pic > a > img')
    hosterName = self_data.select('div.w_240 > h6 > a')

    # A detail page describes exactly one listing, so zip() yields at most one tuple;
    # inserting inside the loop avoids writing an empty record when a page fails to parse.
    for title, roomImage, price, address, hosterImage, name in zip(titles, roomImages, prices, addresses, hosterImages, hosterName):
        data = {
            'title': title.get_text(),
            'roomImage': roomImage.get('src'),
            'price': price.get_text(),
            'address': address.get_text().strip(),
            'hosterImage': hosterImage.get('src'),
            'hosterName': name.get_text(),
            'href': url
        }
        Information.insert_one(data)

def getInformation():
    # Resume support: compare the links that were collected with the links whose
    # details are already stored, and only scrape the ones still missing.
    collected = set(item['href'] for item in Href.find({}, {'href': True, '_id': False}))
    scraped = set(item['href'] for item in Information.find({}, {'href': True, '_id': False}))
    for href in collected - scraped:
        getPageInformation(href)



# url='http://bj.xiaozhu.com/fangzi/269024901.html'
getInformation()
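
The same set-difference logic doubles as a progress check before or after a run. This is a minimal sketch assuming the Xiaozhu/href and Xiaozhu/Information collections defined above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
Href = client['Xiaozhu']['href']
Information = client['Xiaozhu']['Information']

collected = set(item['href'] for item in Href.find({}, {'href': True, '_id': False}))
scraped = set(item['href'] for item in Information.find({}, {'href': True, '_id': False}))
print('{} links collected, {} scraped, {} remaining'.format(
    len(collected), len(scraped), len(collected - scraped)))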