# The web page to scrape
import requests
from lxml import etree
import json
# Search-results URL to request; the path carries the search keyword "python"
# and the query string carries the filter parameters.
# The "degreefrom" parameter had been corrupted to "°reefrom" ("&deg" was
# decoded as the HTML degree-sign entity), which also swallowed the "&"
# separator; it is restored here.
url = "https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
# Note: there are spaces inside that need to be handled
# Request headers sent with the search request; the values imitate a desktop
# Chrome browser on macOS.
# NOTE(review): "br" is advertised in Accept-Encoding; decoding a Brotli
# response presumably needs a Brotli package installed alongside requests --
# confirm, or drop "br" if responses come back garbled.
header = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Host": "search.51job.com",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/76.0.3809.132 Safari/537.36"
    ),
}
# Page processing
# Fetch the search page, extract one record per result row, and print the
# records as JSON.
response = requests.get(url=url, headers=header)
# Force GBK decoding before reading response.text.
response.encoding = "gbk"
html_51job = etree.HTML(response.text)

# Every <div class="el"> under the #resultList container is treated as one
# result row.
all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")

info_list = []
for item in all_div:
    info = {}
    # The leading "." makes each XPath relative to the current row (item)
    # rather than the whole document -- do not drop it.
    # xpath() returns a list, so take element 0 to get the value itself.
    info['job_name'] = item.xpath("./p/span/a/@title")[0]
    info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
    # Take the first text node like the other fields do (the whole result
    # list used to be stored here).
    info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
    # The salary span may be empty, in which case the XPath result is an
    # empty list; fall back to a placeholder instead of failing.
    try:
        info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
    except IndexError:
        info['money'] = '无数据'
    info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
    info_list.append(info)

# The stray module-level "return info_list" was removed: "return" is only
# legal inside a function, and info_list is already available at module scope.
print(json.dumps(info_list))