Test 1-2: Scrape 300 rental listings
from bs4 import BeautifulSoup
import requests
import time
# Single-page method: fetch one listing page and parse the fields we need
url = 'http://wh.xiaozhu.com/fangzi/1947701970.html'
wd_data = requests.get(url)
soup = BeautifulSoup(wd_data.text, 'lxml')
def fd_sex(class_name):
    # BeautifulSoup returns the class attribute as a list of class names;
    # 'member_ico1' marks a male host's gender icon.
    if 'member_ico1' in class_name:
        return 'male'
    else:
        return 'female'
data = {
    'title': soup.select('h4 em')[0].text,
    'address': soup.select('span.pr5')[0].text.strip().split(' ')[0],
    'day_prices': int(soup.select('div.day_l span')[0].text),
    'imgs': soup.select('img#curBigImage')[0].get('src'),
    'fd_imgs': soup.select('div.member_pic a img')[0].get('src'),
    'fd_name': soup.select('a.lorder_name')[0].get('title'),
    # The gender icon div is either member_ico1 or member_ico; try both so the
    # lookup works for either gender instead of raising IndexError for one of them.
    'fd_sex': fd_sex((soup.select('div.member_ico1') or soup.select('div.member_ico'))[0].get('class')),
}
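Task 2 below needs this same field extraction for every collected link, so it helps to wrap the parsing into a function. A minimal sketch, assuming every listing page shares the structure parsed above; get_detail is a hypothetical name introduced here, not part of the original code:

import requests
from bs4 import BeautifulSoup

# `get_detail` is a hypothetical helper name introduced for illustration.
def get_detail(url):
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    return {
        'title': soup.select('h4 em')[0].text,
        'day_prices': int(soup.select('div.day_l span')[0].text),
        # ...the remaining fields follow the same select() pattern as `data` above
    }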
# Multi-page method: collect listing links from the search result pages
page_link = []

def list_xiaozu_url(pages):
    page_link.clear()
    for each_number in range(1, pages):
        xiaozu_url = 'http://wh.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        time.sleep(4)  # pause between requests so the site is not hammered
        wd_data = requests.get(xiaozu_url)
        soup = BeautifulSoup(wd_data.text, 'lxml')
        for link in soup.select('a.resule_img_a'):
            url_list = link.get('href')
            page_link.append(url_list)
    print(page_link)

# 12 search pages at roughly 24 listings each gives about 300 links.
list_xiaozu_url(13)
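Task 2 below mentions guarding against anti-scraping measures. A common precaution is sending a browser-like User-Agent header and keeping a delay between requests. A minimal sketch: polite_get is a hypothetical name, and the header string is illustrative:

import time
import requests

# Hypothetical browser-like headers; the exact User-Agent string is illustrative.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

def polite_get(url, delay=4):
    time.sleep(delay)  # keep the pause between requests, as in list_xiaozu_url above
    return requests.get(url, headers=HEADERS, timeout=10)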
Task 1: scrape the 300 rental listing links (done).
Task 2: write the code that scrapes each listing's details, guard against anti-scraping measures, and keep the scraped fields tidy (not done; see the header/delay sketch above).
Task 3: aggregate and analyze the data (see the sketch below).
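For Task 3, a minimal sketch of simple aggregation, assuming the details for every link in page_link have been collected into a list of dicts shaped like data above; results is a hypothetical variable name:

from statistics import mean

results = []  # hypothetical, e.g. [get_detail(url) for url in page_link]
if results:
    prices = [item['day_prices'] for item in results]
    print('listings scraped:', len(prices))
    print('average daily price:', mean(prices))
    print('price range:', min(prices), '-', max(prices))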