# 目标url: https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
# post请求要提交的表单数据
from urllib import request,parse
import json,pymysql,time
def lagouspider(url, formdata):
    """
    Crawl every result page of a position search and store each job.

    Pages are requested one after another in a loop (no recursion, so the
    call stack does not grow with the number of pages or retries). After a
    page has been stored, ``formdata['pn']`` is advanced to the next page
    until ``pageNo * pageSize`` reaches ``totalCount``. When a request is
    not successful, the same page is requested again after a 10 second
    pause.

    :param url: address the search form is posted to
    :param formdata: dict of POST form fields; its ``'pn'`` entry (the page
        number) is updated in place while crawling
    :return: None
    """
    while True:
        # Send the request and get the raw response text back.
        response_data = load_page_data(url, formdata)
        # load_page_data returns None when the status is not 200; treat
        # that the same way as a response whose 'success' flag is false.
        data = json.loads(response_data) if response_data is not None else {}

        if not data.get('success'):
            print('请求不成功,请稍后尝试')
            time.sleep(10)
            # 'pn' is stored as an int, so it must be converted before it
            # can be concatenated with the message text.
            print('重新发起第' + str(formdata['pn']) + '页请求')
            # NOTE: retries are unbounded; the same page is requested
            # until the server reports success.
            continue

        print('请求成功')
        content = data['content']

        # Store every position listed on this page.
        for jobinfo in content['positionResult']['result']:
            save_data_to_db(_extract_job(jobinfo))

        # Decide whether another page has to be requested.
        cur_page = int(content['pageNo'])
        page_size = int(content['pageSize'])
        totalcount = int(content['positionResult']['totalCount'])
        if cur_page * page_size >= totalcount:
            break

        next_page = cur_page + 1
        print('继续发起请求第' + str(next_page) + '页')
        formdata['pn'] = next_page


def _join_labels(value):
    """Return *value* as a single comma-separated string.

    A missing/empty value becomes ``''``. A value that is already a string
    is returned unchanged, because ``','.join`` on a string would insert a
    comma between every character. Any other iterable of strings is joined
    with commas.
    """
    if not value:
        return ''
    if isinstance(value, str):
        return value
    return ','.join(value)


def _extract_job(jobinfo):
    """Pick the stored fields out of one position entry of the response."""
    return {
        'positionName': jobinfo['positionName'],
        'publishTime': jobinfo['formatCreateTime'],
        'companyname': jobinfo['companyShortName'],
        'salary': jobinfo['salary'],
        # NOTE(review): 'worYear' looks like a typo of 'workYear', but the
        # key is used as a column name by save_data_to_db, so it is kept.
        'worYear': jobinfo['workYear'],
        'education': jobinfo['education'],
        'industry': jobinfo['industryField'],
        'stage': jobinfo['financeStage'],
        'companySize': jobinfo['companySize'],
        'fuli': _join_labels(jobinfo['companyLabelList']),
        'positionAdvantage': _join_labels(jobinfo['positionAdvantage']),
    }
def load_page_data(url, formdata):
    """
    Send a POST request and return the decoded response body.

    :param url: address the form is posted to
    :param formdata: dict of form fields; it is URL-encoded before sending
    :return: the response body decoded as UTF-8 when the status is 200,
        otherwise None
    """
    # Convert the form fields into URL-encoded bytes, the form in which a
    # web server expects POST data.
    form_data = parse.urlencode(formdata).encode('utf-8')
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Referer': 'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
    }
    # Build the request with the *encoded* bytes; passing the raw dict is
    # not valid POST data for urllib.
    req = request.Request(url, headers=req_header, data=form_data)
    # The context manager closes the response even if reading fails.
    with request.urlopen(req) as response:
        if response.status == 200:
            return response.read().decode('utf-8')
    return None
def save_data_to_db(jobdata):
    """
    Insert one record into the ``lagou`` table.

    The keys of ``jobdata`` are used as the column list and the values are
    passed to ``cursor.execute`` as query parameters. The transaction is
    committed on success; on any exception the error is printed and the
    transaction is rolled back.

    Relies on the module-level ``cursor`` and ``mysql_client`` objects.

    :param jobdata: dict mapping column names to the values to store
    :return: None
    """
    column_list = ','.join(jobdata.keys())
    # One '%s' placeholder per value; the driver fills them in on execute.
    placeholder_list = ','.join('%s' for _ in range(len(jobdata)))
    statement = "\nINSERT INTO lagou(%s)\nVALUE (%s)\n" % (
        column_list,
        placeholder_list,
    )
    try:
        row_values = list(jobdata.values())
        cursor.execute(statement, row_values)
        mysql_client.commit()
    except Exception as err:
        print(err)
        mysql_client.rollback()
if __name__ == '__main__':
    # Open the database connection.
    # Keyword arguments are used because PyMySQL 1.0+ no longer accepts
    # positional connection arguments. They follow the parameter order
    # host, user, password, database, port (then unix_socket, charset).
    # NOTE(review): the credentials are hard-coded in the script; consider
    # loading them from the environment or a config file instead.
    mysql_client = pymysql.Connect(
        host='127.0.0.1',
        user='root',
        password='18603503110',
        database='1712B',
        port=3306,
        charset='utf8',
    )
    # Cursor used by save_data_to_db to execute SQL statements.
    cursor = mysql_client.cursor()
    try:
        # Target endpoint.
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        # Form fields to submit; 'pn' is the page number and is advanced
        # by lagouspider while crawling.
        formdata = {
            'first': 'true',
            'pn': 1,
            'kd': 'c++',
        }
        lagouspider(url, formdata)
    finally:
        # Release the database resources even if the crawl fails.
        cursor.close()
        mysql_client.close()
网络爬虫:urllib模块应用7--拉勾
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...