今天写的爬虫是为了抓取拉勾网上面的招聘信息,代码如下:
from bs4 import BeautifulSoup
import requests
import time
def get_jobs_id(content):
    """Extract the position id of every job entry.

    Args:
        content: iterable of job dicts as returned by Lagou's
            positionAjax.json API; each dict must have a 'positionId' key.

    Returns:
        list of the 'positionId' values, in input order.
    """
    # Comprehension replaces the manual append loop (same order, same values).
    return [value['positionId'] for value in content]
def get_links(content):
    """Build the job-detail page URL for every job entry.

    Args:
        content: iterable of job dicts (see get_jobs_id).

    Returns:
        list of 'http://www.lagou.com/jobs/<id>.html' URLs, in input order.
    """
    # One URL per position id; comprehension replaces the append loop.
    return ['http://www.lagou.com/jobs/' + str(item) + '.html'
            for item in get_jobs_id(content)]
def get_jobs_info(content):
    """Fetch and scrape the detail page of every job entry.

    Args:
        content: iterable of job dicts (see get_jobs_id).

    Returns:
        list of dicts with keys 'company', 'money', 'detail', 'addr',
        whitespace-normalized text scraped from each detail page.
    """
    data_list = []
    for link in get_links(content):
        time.sleep(1)  # be polite: throttle one request per second
        # timeout prevents a dead connection from hanging the whole crawl
        response = requests.get(link, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')
        company_names = soup.select('dt > h1 > div')
        moneys = soup.select('dd.job_request')
        details = soup.select('dd.job_bt')
        addrs = soup.select('div.work_addr')
        # Each selector normally matches once per page; zip pairs them up
        # and silently drops entries if any selector found nothing.
        for company, money, detail, addr in zip(company_names, moneys, details, addrs):
            data_list.append({
                # split()+join collapses all internal whitespace/newlines
                'company': ''.join(company.get_text().split()),
                'money': ','.join(money.get_text().split()),
                'detail': ''.join(detail.get_text().split()),
                'addr': ''.join(addr.get_text().split()),
            })
    return data_list
# Entry point: query Lagou's search API for pages 1-3 and print the
# scraped detail data of every listed position.
url = 'http://www.lagou.com/jobs/positionAjax.json'
for page_number in range(1, 4):
    # Build the POST payload directly with the right page number instead of
    # hard-coding 'pn': '1' and immediately overwriting it with update().
    post_data = {
        'first': 'true',
        'kd': '运维工程师',   # search keyword
        'pn': str(page_number),
        'city': '成都',
    }
    # timeout so a stalled connection cannot hang the script
    r = requests.post(url, data=post_data, timeout=10)
    contents = r.json()['content']['positionResult']['result']
    jobs = get_jobs_info(contents)
    print(jobs)
思路:
在fire-bug下面发现搜索职业后会从浏览器上传了几个参数到拉勾的服务器一个是 first =true, 一个是kd = android, (关键字) 一个是pn =1 (page number 页码
这样就获得了json数据,数据中有一个positionId是招聘具体信息的id,这样就获得了具体的url链接,然后访问链接获得具体信息。
执行效率不高,希望后续能够修改!