Libraries used (for reference):
usage of the Requests library
xpath
csv
from lxml import etree
import requests
import time
import csv
First, find the information we need on the page.
Log in to Boss直聘 (www.zhipin.com) and search for "数据分析" (data analysis). Open the browser's web inspector, refresh the page, and locate the request header information.
req_url varies with the page number, and headers can be copied straight out of the inspector. The function below sends the request and returns the response.
def gethtml(i):
    # build the request URL; the page number i appears twice in the URL
    req_url = 'https://www.zhipin.com/c100010000/?query=数据分析&page=%s&ka=page-%s' % (i, i)
    # headers copied from the browser inspector; replace the Cookie with your own session
    headers = {
        "Cookie": "Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1548331018; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1547734371,1547971749,1548077779,1548208490; __a=4592978.1541328694.1548077779.1548208490.873.10.12.457; __l=l=%2Fwww.zhipin.com%2F&r=; JSESSIONID=""; toUrl=http%3A%2F%2Fwww.zhipin.com%2Fc100010000%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%26ka%3Dsel-city-100010000; lastCity=101010100; __c=1548208490; __g=-; _uab_collina=154169007228349597439633",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Host": "www.zhipin.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15",
        "Accept-Language": "zh-cn",
        "Accept-Encoding": "br, gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Connection": "keep-alive"
    }
    # send the GET request and return the response object
    return requests.get(req_url, headers=headers)
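A quick way to confirm the request works (a sketch, not in the original post) is to fetch the first page and peek at the response:

# illustrative sanity check: fetch page 1 and inspect the response
resp = gethtml(1)
print(resp.status_code)   # expect 200 if the cookie is still valid
print(resp.text[:200])    # the start of the returned HTML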
To parse the HTML with XPath, first load it into an lxml etree.
# return the DOM node set of job postings on one page
def getcompanys(req_result):
    dom = etree.HTML(req_result.text)
    company_path = "//div[@class='job-primary']"
    companys = dom.xpath(company_path)
    return companys
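Each element of companys is itself an etree node, so relative XPath expressions starting with .// can be run against it. A minimal, self-contained illustration of this pattern (hypothetical markup, not the real page):

# demo: relative XPath on a single node, using made-up markup
sample = ('<div class="job-primary"><h3 class="name">'
          '<div class="job-title">数据分析</div><span>15k-25k</span></h3></div>')
node = etree.HTML(sample).xpath("//div[@class='job-primary']")[0]
print(node.xpath(".//div[@class='job-title']/text()")[0])  # -> 数据分析
print(node.xpath(".//span/text()")[0])                     # -> 15k-25k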
# return a dict of details for one job node
def getdetails(c):
    # extract each field with a relative XPath
    positionName = c.xpath(".//h3[@class='name']//div[@class='job-title']/text()")[0]
    salary = c.xpath(".//h3[@class='name']//span/text()")[0]
    city = c.xpath(".//p/text()")[0]
    experience = c.xpath(".//p/text()")[1]
    education = c.xpath(".//p/text()")[2]
    companyShortName = c.xpath(".//div[@class='info-company']//a/text()")[0]
    industryField = c.xpath(".//div[@class='info-company']//p/text()")[0]
    financeStage = c.xpath(".//div[@class='info-company']//p/text()")[1]
    companySize = c.xpath(".//div[@class='info-company']//p/text()")[2]
    # xpath always returns a list, so take the first match
    positionId = c.xpath(".//div[@class='info-primary']//a/@data-jobid")[0]
    details = {
        'positionId': positionId,
        'positionName': positionName,
        'salary': salary,
        'city': city,
        'experience': experience,
        'education': education,
        'companyShortName': companyShortName,
        'industryField': industryField,
        'financeStage': financeStage,
        'companySize': companySize
    }
    return details
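Every [0] index above raises an IndexError when a listing is missing a field. A small helper (not in the original code) can make the extraction more forgiving:

# hypothetical helper: return the first XPath match, or '' if there is none
def first(node, path):
    result = node.xpath(path)
    return result[0] if result else ''

# e.g. salary = first(c, ".//h3[@class='name']//span/text()")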
Save the results to a CSV file.
# save to a CSV file
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['positionId', 'positionName', 'salary', 'city', 'experience',
                  'education', 'companyShortName', 'industryField', 'financeStage', 'companySize']
    csvobj = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csvobj.writeheader()
    for i in range(1, 10):
        req_result = gethtml(i)
        companys = getcompanys(req_result)
        for c in companys:
            details = getdetails(c)
            # write one row per job
            csvobj.writerow(details)
        # pause between pages so requests are not fired too quickly
        time.sleep(1)
The resulting data.csv contains one row per job listing.
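As an optional check (a sketch, not part of the original post), the file can be read back with csv.DictReader to count the rows written:

# illustrative check: read data.csv back and count the rows
with open('data.csv', newline='', encoding='utf-8') as f:
    rows = list(csv.DictReader(f))
print('wrote %d rows' % len(rows))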