1. Basics
- Step 1: fetch the page source with requests.get()
- Step 2: parse the downloaded HTML with BeautifulSoup (a minimal sketch follows this list)
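A minimal sketch of these two steps; example.com is a placeholder URL, and the 'lxml' parser assumes the lxml package is installed (the built-in 'html.parser' works as a fallback):

```python
import requests
from bs4 import BeautifulSoup

# Step 1: download the page source
resp = requests.get('http://example.com')

# Step 2: parse it so elements can be picked out with CSS selectors
soup = BeautifulSoup(resp.text, 'lxml')
print(soup.title.get_text())
```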
2. Trying it myself: scraping listing data from Xiaozhu short-term rentals
Note: the assignment asks for 300 listings; to save time and memory I only scraped the first two pages.
- The Result: (output screenshot omitted)
- The Code:
```python
from bs4 import BeautifulSoup
import requests

# URLs of the first two search-result pages
urls = ["http://bj.xiaozhu.com/search-duanzufang-p{}-0/".format(number)
        for number in range(1, 3)]


def judge_sex(sex):
    """Map the CSS class of the avatar badge to the host's gender."""
    if sex == 'member_ico1':
        return 'female'
    elif sex == 'member_ico':
        return 'male'
    else:
        return 'unknown'


def get_details(url):
    """Scrape one listing's detail page and print its key fields."""
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    ads = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    big_imgs = soup.select('#curBigImage')
    prices = soup.select('#pricePart > div.day_l > span')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    member_pics = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    sexes = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    for title, ad, big_img, price, name, member_pic, member_sex in zip(
            titles, ads, big_imgs, prices, names, member_pics, sexes):
        print(title.get_text(),
              'address:', ad.get_text(),
              'image:', big_img.get('src'),
              'price (RMB):', price.get_text(),
              'host name:', name.get('title'),
              'host avatar:', member_pic.get('src'),
              'host gender:', judge_sex(member_sex.get('class')[0]))


def get_link(url):
    """Collect every listing link on a results page and scrape each one."""
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    for link in links:
        get_details(link.get('href'))


for single_url in urls:
    get_link(single_url)
```
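The code above fires requests back-to-back and assumes every response succeeds. A small wrapper along these lines would make it more robust and polite; the User-Agent string, timeout, and one-second delay are my own assumptions, not anything the site specifies:

```python
import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string, an assumption


def polite_get(url, delay=1.0):
    """requests.get with a pause and basic error checking."""
    time.sleep(delay)  # wait between requests to avoid hammering the server
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    return resp
```

Swapping `requests.get(url)` for `polite_get(url)` inside get_details() and get_link() would apply this everywhere.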
3. Reflections and Takeaways
- I am now more comfortable defining and calling functions.
- When copying a selector from the browser, check whether a shorter or more robust alternative exists (see the sketch after this list).
- My first attempt failed because I typed the URL by hand and made a typo, which cost me a lot of debugging time; from now on I will copy and paste URLs instead.
- Remember the colon after if, elif, and else.
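On the selector point: the selector Chrome's "Copy selector" produces is often longer than it needs to be. A sketch of the idea, using the title selector from the code above; the shorter form assumes div.pho_info appears exactly once on a detail page, with `soup` being the parsed page from get_details():

```python
# Full selector as copied from the browser:
titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')

# A shorter equivalent that keys only on the distinctive class names:
titles = soup.select('div.pho_info > h4 > em')
```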