- 目的:爬取智联招聘上全国的所有职位信息
职位信息包括:职位名称、福利、待遇、薪水、工作地点、工作经验、学历要求等 -
流程
包括分析页面和编写代码两部分
-
分析页面
1.主页面由职业分类组成,找到对应id图二
2.副页面由职位列表和下一页组成
注意/table//tbody/tr/中,tbody是浏览器自动加上的,写XPath时要去掉
3.职位详情页面
- 编写代码
1.获取职位分类列表
from lxml import etree
import requests
import re
def main_Web(url):
    """Fetch the Zhaopin home page and build one search URL per job category.

    Args:
        url: site root, expected to end with "/" (the trailing slash is
             stripped before joining the relative category links).

    Returns:
        list[str]: absolute category-search URLs with the location query
        parameter forced to jl=489 (nationwide).

    Note: relies on the module-global ``head`` request headers.
    """
    res = requests.get(url, headers=head).content.decode("utf-8")
    r = etree.HTML(res)
    # Relative hrefs of every category anchor on the home page.
    u_list = r.xpath("//div[@id='search_right_demo']/div/div/a/@href")
    # Raw string so \d is a regex digit class, not an (invalid) string escape.
    pattern = re.compile(r"jl=\d+&")
    url_list = [url[:-1] + pattern.sub("jl=489&", i) for i in u_list]
    return url_list
2.获取职位列表
def sub_Web(url):
    """Fetch one category-search page and extract the job listing links.

    Args:
        url: absolute search URL produced by ``main_Web``.

    Returns:
        tuple: (u_list, next_page) where u_list is the list of job-detail
        URLs on this page and next_page is the (possibly empty) list of
        "next page" hrefs.

    Note: relies on the module-global ``head`` request headers.
    """
    res = requests.get(url, headers=head).content.decode("utf-8")
    r = etree.HTML(res)
    # First anchor in each listing row is the job-detail link
    # (tbody omitted: the browser inserts it, the raw HTML has none).
    u_list = r.xpath("//div[@class='newlist_list_content']/table/tr/td/div/a[1]/@href")
    next_page = r.xpath("//a[@class='next-page']/@href")
    return u_list, next_page
3.获取职位详情
def sub_sub_Web(url):
    """Scrape one job-detail page.

    Args:
        url: absolute job-detail URL.

    Returns:
        tuple(str, str, str, str): (title, welfare, salary, address);
        welfare is a comma-joined string of the benefit tags.

    Note: relies on the module-global ``head`` request headers.
    """
    res = requests.get(url, headers=head).content.decode("utf-8")
    r = etree.HTML(res)
    title = r.xpath("string(//div[@class='fixed-inner-box'][1]/div[1]/h1)")
    welfare = r.xpath("//div[@class='fixed-inner-box'][1]/div[1]/div[1]/span/text()")
    salary = r.xpath("string(//ul[@class='terminal-ul clearfix']/li[1]/strong)")
    address = r.xpath("string(//ul[@class='terminal-ul clearfix']/li[2]/strong)")
    # Join the tag list so the output reads "a,b" rather than the
    # Python list repr "['a', 'b']" that str(list) would produce.
    return title, ",".join(welfare), salary, address
4.保存
def save_data(title, weflare, salary, address):
    """Append one job record to job.csv as labeled lines plus a separator.

    Args:
        title: job title text.
        weflare: welfare/benefits text (pre-joined string).
        salary: salary text.
        address: work-location text.
    """
    record = [
        "公司名称:" + title + "\n",
        "福利:" + weflare + "\n",
        "薪水:" + salary + "\n",
        "地点:" + address + "\n",
        "***" * 25 + "\n",
    ]
    with open("job.csv", "a+", encoding="utf-8") as file:
        # One writelines call instead of five tiny write calls.
        file.writelines(record)
5.主函数
if __name__ == '__main__':
    url = "https://sou.zhaopin.com/"
    # Spoof a desktop browser; module-global, read by all request helpers.
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    # 1. category search URLs
    url_list = main_Web(url)
    # 2. demo limit: only the first five categories; slicing (unlike
    #    range(5) indexing) cannot IndexError if fewer are found.
    for category_url in url_list[:5]:
        # next_page is currently unused — TODO: follow it to paginate.
        job_list, next_page = sub_Web(category_url)
        for job_url in job_list:
            # 3. detail page
            title, weflare, salary, address = sub_sub_Web(job_url)
            # 4. persist
            save_data(title, weflare, salary, address)