六、续上篇 xpath – 实战–爬取瓜子二手车网站
# 获取详情页面url
def get_detail_urls(url):
    """Fetch one listing page and return the absolute URLs of its detail pages.

    :param url: listing-page URL, e.g. 'https://www.guazi.com/cs/buy/o1'
    :return: list of absolute detail-page URLs; empty list when the car
             listing <ul> is not present in the response.
    """
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    # The author marked this xpath as buggy: Guazi often serves an
    # anti-scraping / verification page, so the <ul> may be absent.
    # Guard instead of letting `[0]` raise IndexError.
    uls = html.xpath('//ul[@class = "carlist clearfix js-top"]')
    if not uls:
        return []
    detail_urls = []
    for li in uls[0].xpath('./li'):  # './' resolves relative to the current node
        href = li.xpath('./a/@href')
        if href:
            # hrefs on the listing page are site-relative; prepend the domain
            detail_urls.append('https://www.guazi.com' + href[0])
    return detail_urls
# 解析详情页面内容
def parse_detail_page(url):
    """Fetch a single car's detail page and print its key attributes.

    :param url: absolute detail-page URL produced by get_detail_urls().
    :side effects: prints a dict; returns nothing.
    """
    # BUG FIX: the original requested the *global* `detail_url` instead of
    # the `url` parameter, silently ignoring the argument passed in.
    resp = requests.get(url, headers=headers)
    text = resp.content.decode('utf-8')
    html = etree.HTML(text)
    title = html.xpath('//div[@class = "product-textbox"]/h2/text()')[0]
    # Drop the literal '\r\n' sequence embedded in the title and trim whitespace.
    title = title.replace(r'\r\n', ' ').strip()
    info = html.xpath('//div[@class = "product-textbox"]/ul/li/span/text()')
    # assumes the page exposes at least 4 <span> fields in this order — TODO confirm
    infos = {
        'title': title,
        'cardtime': info[0],       # registration date
        'km': info[1],             # mileage
        'displacement': info[2],   # engine displacement
        'speedbox': info[3],       # gearbox type
    }
    print(infos)
# -------- script entry point --------
# Start from page 1 of the Changsha listings.
start_url = 'https://www.guazi.com/cs/buy/o1'
# Collect every detail-page URL from the listing
# (the author noted a pending bug in this call).
detail_urls = get_detail_urls(start_url)
# Parse each detail page in turn.
for detail_url in detail_urls:
    parse_detail_page(detail_url)
# Saving the scraped data is not implemented; a sample detail URL:
# https://www.guazi.com/cs/5217eaea382dddc3x.htm#fr_page=list&fr_pos=city&fr_no=4
上一篇文章 第三章 数据解析(五) 2019-12-15 地址:
https://www.jianshu.com/p/dda1a316a8ae
下一篇文章 第三章 数据解析(七) 2019-12-17 地址:
https://www.jianshu.com/p/32b4cadbf9b7
以上资料内容来源网络,仅供学习交流,侵删请私信我,谢谢。