Today's crawler is fairly simple, so I won't go into much detail. It scrapes short-term rental listings from a certain website: each search-result page yields links to listing detail pages, and each detail page yields the listing's title, address, price, images, and host info.
from bs4 import BeautifulSoup
import requests
import time
# Search-result pages p1 through p13; 13 pages of ~24 listings each matches
# the 300-odd records mentioned at the end. (range(14) would start at the
# invalid p0, and str() is unnecessary since format() handles ints.)
base_urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 14)]
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'}
def get_urls(url):
    """Collect the detail-page link of every listing on one search page."""
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    links = soup.select('a[class="resule_img_a"]')  # 'resule' is the class name the site itself uses
    for link in links:  # renamed from `url` so it doesn't shadow the parameter
        href = link.get('href')
        get_details(href)
        time.sleep(1)
def get_details(url):
    """Scrape title, address, price, images, and host info from one detail page."""
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.select('div.pho_info > h4 > em')
    addr = soup.select('p > span.pr5')
    price = soup.select('div.day_l > span')
    image = soup.select('div.pho_show_l > div > div > img')
    owner_img = soup.select('div.member_pic > a > img')
    sex = soup.select('div.member_pic > div.member_ico1')  # this icon is only present for female hosts
    name = soup.select('div.w_240 > h6 > a')
    # A detail page describes a single listing, so each select() returns at
    # most one element and the loop below runs at most once.
    for t, a, p, i, o, n in zip(title, addr, price, image, owner_img, name):
        data = {
            'title': t.get_text(),
            'address': ''.join(a.get_text().split()),  # drop the whitespace inside the address
            'price': p.get_text(),
            'img': i.get('src'),
            'owner_img': o.get('src'),
            'name': n.get_text(),
            'sex': 'female' if sex else 'male'
        }
        # return data
        print(data)
for base_url in base_urls:
    get_urls(base_url)
    time.sleep(2)
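The commented-out `return data` hints that the records were meant to be collected rather than just printed. A minimal sketch of persisting them, assuming `print(data)` in `get_details` is swapped for `results.append(data)` (the `results` list, the `save_records` helper, and the `xiaozhu.json` filename are my own choices, not from the original script):

import json

results = []  # get_details would append each `data` dict here instead of printing it

def save_records(records, path='xiaozhu.json'):
    """Write the collected listing dicts to a UTF-8 JSON file."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

# Once the crawl loop above finishes: save_records(results)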
My own computer's connection was too slow and the crawl never finished; in the end I ran it on an AWS host and scraped a little over 300 records.
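On a slow or flaky connection, a timeout plus simple retries can also keep the crawl from stalling indefinitely. A sketch of one way to harden the two `requests.get` calls, reusing the `headers` defined above (the `fetch` helper and its parameters are my own invention, not part of the original script):

def fetch(url, retries=3):
    """GET a URL, retrying with exponential backoff on network errors."""
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # out of retries, let the caller see the error
            time.sleep(2 ** attempt)  # back off 1s, 2s, ... between attempts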