from bs4 import BeautifulSoup
import requests
import re
# Seconds handed to ``requests.get`` by default, so that a server which stops
# responding cannot block the crawl indefinitely.
_DEFAULT_TIMEOUT = 10

# Anchors on an index page that point at the individual detail pages.
_LISTING_LINK_SELECTOR = '#page_list > ul > li > a'

# CSS class of the owner's icon element -> value stored under 'gender'.
# Any class not listed here is stored as ''.
_GENDER_BY_ICON_CLASS = {
    'member_ico': 'man',
    'member_ico1': 'woman',
}


def _fetch_soup(url, timeout):
    """Download *url* and return its body parsed with the lxml parser."""
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'lxml')


def _first(soup, selector):
    """Return the first element matching *selector*; IndexError if none."""
    return soup.select(selector)[0]


def _listing_links(page_url, timeout):
    """Return the href of every detail-page anchor found on one index page."""
    soup = _fetch_soup(page_url, timeout)
    hrefs = [anchor.get('href') for anchor in soup.select(_LISTING_LINK_SELECTOR)]
    # An anchor without an href attribute yields None, which cannot be
    # requested later on, so such anchors are dropped here.
    return [href for href in hrefs if href]


def _scrape_listing(detail_url, timeout):
    """Fetch one detail page and return its fields as a dict."""
    soup = _fetch_soup(detail_url, timeout)

    title = _first(
        soup,
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em',
    ).get_text()
    # Remove every space, then drop the final character.
    # NOTE(review): the dropped character is presumably trailing whitespace
    # in the page markup -- confirm against a live page.
    address = _first(
        soup,
        'body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5',
    ).get_text().replace(' ', '')[:-1]
    price = _first(soup, '#pricePart > div.day_l > span').get_text()
    house_image = _first(
        soup,
        '#detailImageBox > div.pho_show_r > div > ul > li:nth-of-type(2) > img',
    ).get('data-src')
    owner_image = _first(
        soup,
        '#floatRightBox > div.js_box.clearfix > div.member_pic > a > img',
    ).get('src')
    owner_name = _first(
        soup,
        '#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a',
    ).get_text()
    # .get('class') returns a list of class names, or None when the element
    # has no class attribute; fall back to [''] so the lookup below yields ''.
    icon_classes = _first(
        soup,
        '#floatRightBox > div.js_box.clearfix > div.member_pic > div',
    ).get('class') or ['']
    gender = _GENDER_BY_ICON_CLASS.get(icon_classes[0], '')

    return {
        'title': title,
        'address': address,
        'price': price,
        'house_image': house_image,
        'owner_image': owner_image,
        'owner_name': owner_name,
        'gender': gender,
    }


def _scrape_listings(detail_urls, timeout):
    """Scrape each detail URL in order, printing and collecting the records."""
    records = []
    for detail_url in detail_urls:
        record = _scrape_listing(detail_url, timeout)
        print(record)
        records.append(record)
    return records


def singal_url(url_arg, timeout=_DEFAULT_TIMEOUT):
    """Scrape every detail page linked from a single index page.

    Args:
        url_arg: URL of the index page whose ``#page_list`` anchors are
            followed.
        timeout: Value passed to ``requests.get`` as ``timeout`` for every
            request. Pass ``None`` to wait without limit.

    Returns:
        A list with one dict per detail page, in page order, with the keys
        'title', 'address', 'price', 'house_image', 'owner_image',
        'owner_name' and 'gender'. Each dict is also printed as it is built.

    Raises:
        IndexError: If a detail page lacks one of the expected elements.
        requests.exceptions.RequestException: If a request fails or times
            out.
    """
    return _scrape_listings(_listing_links(url_arg, timeout), timeout)


def some_url(urls_arg, timeout=_DEFAULT_TIMEOUT):
    """Scrape every detail page linked from several index pages.

    All index pages are read first; the combined list of detail links is
    printed once and then each detail page is scraped.

    Args:
        urls_arg: Iterable of index-page URLs.
        timeout: Value passed to ``requests.get`` as ``timeout`` for every
            request. Pass ``None`` to wait without limit.

    Returns:
        A list of dicts shaped like the ones returned by ``singal_url``,
        ordered by index page and then by position on that page. An empty
        iterable yields an empty list without any network access.

    Raises:
        IndexError: If a detail page lacks one of the expected elements.
        requests.exceptions.RequestException: If a request fails or times
            out.
    """
    detail_urls = []
    for page_url in urls_arg:
        detail_urls.extend(_listing_links(page_url, timeout))
    print(detail_urls)
    return _scrape_listings(detail_urls, timeout)
# Example: scrape a single index page
# url = 'http://bj.xiaozhu.com/search-duanzufang-p1-0/'
# singal_info = singal_url(url)
# print(singal_info)
# Example: scrape multiple index pages
# urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1,301,1)]
# some_info = some_url(urls)
# print(some_info)
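# Summary
# - soup.select('XXXXXXXXXXX') returns a list of tags; on one of those tags, .get('class') returns a list, whereas .get('src') returns a str
# - After requests.get(url) has fetched the real page, BeautifulSoup(response.text, 'lxml') parses it into a readable, navigable document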