Web Scraping in Practice: Day 2
Task
Scrape short-term rental listings for the Hangzhou area from 小猪短租 (xiaozhu.com).
Result
A total of 284 listings were scraped and saved to an .xls file; there were still some encoding problems when writing.
Source Code
from bs4 import BeautifulSoup
import requests

# Search result pages 1-13 for short-term rentals in Hangzhou
pages = ['http://hz.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1, 14)]
info = []

def get_info(url):
    # Scrape one listing detail page and append its fields to info
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    data = {
        'title': soup.select('div.pho_info > h4 > em')[0].get_text(),
        'address': soup.select('div.pho_info > p > span')[0].get_text().strip(' ').strip('\n'),
        'price': soup.select('#pricePart > div.day_l > span')[0].get_text() + '/' + soup.select('#pricePart > div.day_r')[0].get_text(),
        # The image link downloads instead of opening directly in Chrome; it opens fine in IE
        'house_image': soup.select('#curBigImage')[0]['src'],
        'master_name': soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0]['title'],
        # The host's gender is taken from the second segment of the icon's class name
        'master_sex': soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')[0]['class'][0].split('_')[1],
        'master_image': soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')[0]['src']
    }
    print(data)
    info.append(data)

def get_url(start_url):
    # Collect the detail-page links from one search results page
    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    urls = soup.select('#page_list > ul > li > a')
    return urls

for page in pages:
    urls = get_url(page)
    for url in urls:
        try:
            get_info(url['href'])
        except Exception:
            # Skip listings whose page structure does not match the selectors
            pass

# Write the results as tab-separated key/value pairs; no explicit encoding is
# passed to open(), which is where the encoding problems mentioned above arise
with open('xiaozhu.xls', 'w') as f:
    for i in info:
        for key in i:
            try:
                f.write(key)
                f.write('\t')
                f.write(i[key])
                f.write('\t')
            except Exception:
                break
        f.write('\n')
Summary
- The encoding problem when writing the data remains to be investigated; one possible fix is sketched below.
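
A minimal sketch of one likely fix, assuming the problem is that open() without an encoding argument falls back to the system locale encoding (often GBK on Chinese Windows), which cannot represent some of the scraped characters. Passing an explicit encoding and using csv.writer for the tab-separated output should avoid the error; save_info and the xiaozhu.tsv filename are hypothetical, and the field names mirror the dict built in get_info above.

import csv

def save_info(info, path='xiaozhu.tsv'):
    # Sketch only: 'utf-8-sig' writes a BOM so Excel detects UTF-8;
    # plain 'utf-8' or 'gbk' would also avoid the locale-default error
    fieldnames = ['title', 'address', 'price', 'house_image',
                  'master_name', 'master_sex', 'master_image']
    with open(path, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(info)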