from bs4 import BeautifulSoup
import requests
# Build the listing-page URLs to crawl (pages 1 through 100 of each site).
# PC (desktop) site
urls = ['http://bj.lianjia.com/zufang/pg{}/'.format(i) for i in range(1, 101)]
# Mobile site
murls = ['http://m.lianjia.com/bj/zufang/pg{}'.format(i) for i in range(1, 101)]
# Scrape rental listings from the PC (desktop) site
def pachongpc(url):
    """Fetch one PC-site listing page, print and return its rental records.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts, one per listing, with the keys ``adr``, ``style``,
        ``area``, ``price``, ``time`` and ``img``. The same list is printed
        to stdout.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    def clean(text):
        # Remove every pair of consecutive non-breaking spaces (U+00A0).
        return text.replace('\xa0\xa0', '')

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    panel = '#house-lst > li > div.info-panel'
    where = panel + ' > div.col-1 > div.where'
    names = soup.select(panel + ' > h2 > a')
    adrs = soup.select(where + ' > a > span')
    styles = soup.select(where + ' > span.zone > span')
    areas = soup.select(where + ' > span.meters')
    prices = soup.select(panel + ' > div.col-3 > div.price > span')
    times = soup.select(panel + ' > div.col-3 > div.price-pre')
    imgs = soup.select('#house-lst > li > div.pic-panel > a > img')

    data = []
    # zip() stops at the shortest list, so the lists are assumed to line up
    # one-to-one per listing. The title (`names`) is not emitted but still
    # takes part in the zip so the number of records is unchanged.
    for _name, adr, style, area, price, posted, img in zip(
            names, adrs, styles, areas, prices, times, imgs):
        data.append({
            'adr': clean(adr.get_text()),
            'style': clean(style.get_text()),
            'area': clean(area.get_text()),
            'price': clean(price.get_text()),
            'time': clean(posted.get_text()),
            # An <img> without a src attribute yields '' instead of raising.
            'img': clean(img.get('src') or ''),
        })
    # NOTE(review): the source's indentation was lost; printing once per page
    # (after the loop) is assumed -- confirm against the intended output.
    print(data)
    return data
# Scrape rental listings from the mobile site
def pachongmo(url):
    """Fetch one mobile-site listing page, print and return its rental records.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts, one per listing, with the keys ``adr``, ``style``,
        ``price``, ``cate`` (a list of the tag box's stripped strings) and
        ``img``. The same list is printed to stdout.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    def clean(text):
        # Remove every pair of consecutive non-breaking spaces (U+00A0).
        return text.replace('\xa0\xa0', '')

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # The selector strings are kept verbatim from the original code.
    names = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_main.text_cut')
    adrs = soup.select(' section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_other')
    styles = soup.select(' section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_minor > div.info')
    prices = soup.select('div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.item_minor > div.price_total.q_rentprice')
    cates = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.item_list > div.tag_box')
    imgs = soup.select('section.page.page_zufang > div > div.mod_box.house_lists > div.mod_cont > ul > li > div > div.mod_media > div > img')

    data = []
    # zip() stops at the shortest list, so the lists are assumed to line up
    # one-to-one per listing. The title (`names`) is not emitted but still
    # takes part in the zip so the number of records is unchanged.
    for _name, adr, style, price, tag_box, img in zip(
            names, adrs, styles, prices, cates, imgs):
        data.append({
            'adr': clean(adr.get_text()),
            'style': clean(style.get_text()),
            'price': clean(price.get_text()),
            'cate': list(tag_box.stripped_strings),
            # An <img> without a src attribute yields '' instead of raising.
            'img': clean(img.get('src') or ''),
        })
    # NOTE(review): the source's indentation was lost; printing once per page
    # (after the loop) is assumed -- confirm against the intended output.
    print(data)
    return data
if __name__ == "__main__":
    # Crawl every PC listing page first, then every mobile listing page.
    for url in urls:
        pachongpc(url)
    for murl in murls:
        pachongmo(murl)
# Note: a tag name followed by square brackets (a CSS attribute selector) can
# be used to target one specific tag, e.g.
# "div>div.property_title>a[target='_blank']"