import time

import requests
from bs4 import BeautifulSoup
# Address of a single listing page. Nothing in the visible code reads this
# module-level name; the functions below take their own ``url`` argument.
url = 'http://bj.xiaozhu.com/fangzi/597754001.html'
def _owner_gender(markup):
    """Map the markup of an owner badge element to a gender label.

    Returns '女' (female) when the markup contains ``member_ico1``, '男'
    (male) when it contains ``member_ico`` only, and '空' (empty) otherwise.
    """
    # 'member_ico' is a prefix of 'member_ico1', so the longer name must be
    # tested first.
    if 'member_ico1' in markup:
        return '女'
    if 'member_ico' in markup:
        return '男'
    return '空'


def house_info(url: str, data=None) -> None:
    """Download one listing page and print the fields scraped from it.

    For every complete set of matched elements a dict with the keys
    ``title``, ``address``, ``price``, ``house_image``, ``owner_image``,
    ``owner_name`` and ``sex`` is printed. Nothing is returned.

    Args:
        url: Address of the listing page to fetch.
        data: Accepted for backward compatibility; the value passed in is
            never read.

    Raises:
        requests.exceptions.RequestException: If the request fails, or the
            server does not respond within the timeout.
    """
    # Without a timeout a stalled server would block the crawl indefinitely.
    wb_data = requests.get(url, timeout=10)
    # Pause one second after every request (presumably to throttle the crawl).
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select(
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addresses = soup.select(
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
    prices = soup.select(' div.day_l > span')
    house_images = soup.find_all(id='curBigImage')
    owner_images = soup.select(
        ' div.js_box.clearfix > div.member_pic > a > img')
    owner_names = soup.select(' div.js_box.clearfix > div.w_240 > h6 > a')

    # Gender is derived from the serialized markup of each badge element.
    badges = soup.select('div.js_box.clearfix > div.member_pic > div')
    genders = [_owner_gender(str(badge)) for badge in badges]

    # zip() stops at the shortest list, so a page on which any selector
    # matches nothing prints no record at all.
    rows = zip(titles, addresses, prices, house_images,
               owner_images, owner_names, genders)
    for title, address, price, house_image, owner_image, owner_name, sex in rows:
        record = {
            'title': title.get_text(),
            'address': address.get_text().rstrip(),
            'price': price.get_text(),
            'house_image': house_image.get('src'),
            'owner_image': owner_image.get('src'),
            'owner_name': owner_name.get_text(),
            'sex': sex,
        }
        print(record)
# Search-result page URLs for page numbers 1 through 11 (the range end is
# exclusive).
url1 = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)
    for page in range(1, 12)
]
def multi_info(url: str) -> None:
    """Fetch one search-result page and scrape every listing it links to.

    Each anchor matching ``a.resule_img_a`` has its ``href`` passed to
    ``house_info``, which downloads that page and prints its details.

    Args:
        url: Address of the search-result page.

    Raises:
        requests.exceptions.RequestException: If the request fails, or the
            server does not respond within the timeout.
        KeyError: If a matched anchor has no ``href`` attribute.
    """
    # Without a timeout a stalled server would block the crawl indefinitely.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # NOTE(review): the selector spelling ("resule") is kept exactly as in the
    # original; presumably it mirrors the site's own class name - confirm.
    for link in soup.select('a.resule_img_a'):
        house_info(str(link['href']))
# Crawl every search-result page in turn; this runs as soon as the module is
# executed.
for url2 in url1:
    multi_info(url2)