Implemented a scraper that collects listing information from a short-term rental site (xiaozhu.com).
[Screenshot of the scraped results]
#!python
from bs4 import BeautifulSoup
import requests
import time

# Build the URLs of the twelve list pages to crawl
page_urls = ["http://bj.xiaozhu.com/danjian-duanzufang-p{}-2/".format(i) for i in range(1, 13)]

# Browser-like request headers (User-Agent plus a session cookie)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Cookie': '__cfduid=d4ec4fb3066eb487805416f4176347d081470564018; PHPSESSID=q0q2gacrhkjfb0aoj13p50j843; HstCfa3517653=1470564023815; HstCmu3517653=1470564023815; HstCla3517653=1470573628450; HstPn3517653=15; HstPt3517653=15; HstCnv3517653=1; HstCns3517653=6'
}

infos = []


def get_item_list(url):
    # Fetch one list page; each result block stores the detail-page
    # URL in its 'detailurl' attribute
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname')
    for link in links:
        get_item_detail_info(link.get('detailurl'))


def get_item_detail_info(url):
    # Fetch one listing's detail page and extract the fields we care about
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addresses = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    prices = soup.select('#pricePart > div.day_l > span')
    pic_srcs = soup.select('#curBigImage')
    avatars = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    for title, address, price, pic_src, avatar, name, sex in zip(
            titles, addresses, prices, pic_srcs, avatars, names, sexs):
        data = {
            'title': title.get_text(),
            'address': address.get_text(),
            'price': price.get_text(),
            'picsrc': pic_src.get('src'),
            'avatar': avatar.get('src'),
            'name': name.get_text(),
            'sex': get_sex(sex.get('class'))
        }
        infos.append(data)
        print(data)


def get_sex(class_list):
    # The host's gender is not plain text; it is encoded in the CSS class
    # of the avatar badge ('member_ico1' marks a female host)
    if class_list == ['member_ico1']:
        return '女'
    return '男'


for page_url in page_urls:
    get_item_list(page_url)
    time.sleep(2)  # pause between list pages so the crawl is not too fast
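The script only prints each record as it is collected. As a small follow-up, here is a minimal sketch of persisting the `infos` list to a CSV file once the crawl has finished; the filename and field order are my own choices, not part of the original script.

#!python
import csv

# Assumes the infos list populated by the crawler above
with open('xiaozhu_listings.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(
        f, fieldnames=['title', 'address', 'price', 'picsrc', 'avatar', 'name', 'sex'])
    writer.writeheader()
    writer.writerows(infos)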
Summary
- Sometimes the value you want is not plain text on the page; you have to infer it from other parts of the markup (here, the host's gender is derived from a CSS class rather than read as text).
- When scraping at scale, be sure to add delays between requests so you don't get blocked for crawling too fast.
- Personally, I feel that passing a header and cookie to mimic a browser is probably the better approach! A sketch combining the last two points follows below.
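As an illustration of the last two points, here is a minimal sketch, not from the original post, that keeps the browser-like headers on a requests.Session and randomizes the delay between requests; the jitter range and timeout are illustrative choices.

#!python
import random
import time

import requests

# Sketch of throttled, browser-like fetching; the User-Agent is reused
# from the scraper above, while the 1-3 s jitter and the timeout are
# illustrative choices rather than part of the original script
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/50.0.2661.102 Safari/537.36',
})


def polite_get(url):
    # A randomized pause makes the request pattern look less mechanical
    # than a fixed sleep
    time.sleep(random.uniform(1, 3))
    return session.get(url, timeout=10)

Replacing the bare requests.get calls in the scraper with polite_get would give every request the same headers and built-in throttling.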