背景描述:根据给定excel中的日企名称,在招聘网站上搜索所有岗位信息,并最终生成excel表格。
使用到的框架:scrapy
数据库:mysql
难点:部分信息需要爬取二级页面及分页,暂未实现,待更新。
原始表格:
需要生成的列表项:
单位名称、岗位名称、学历要求、工作年限要求、工作地点、岗位年薪、招聘人数、到岗时间、任职要求
爬取部分代码实现:
import array
import time
import scrapy
import json
from qcwy.items import QcwyItem
from openpyxl import load_workbook
from openpyxl import Workbook
from urllib.request import urlopen
from urllib.request import Request
from urllib import parse
from lxml import etree
import sys
from bs4 import BeautifulSoup
from openpyxl import Workbook
import xlwt
import requests
import json
from bs4 import BeautifulSoup
from xlrd import open_workbook
import time
from xlutils.copy import copy
class QcwycrawlerSpider(scrapy.Spider):
    """Search 51job for every company name listed in an Excel sheet.

    On construction the spider reads column 1 of the configured worksheet and
    builds one search URL per non-empty cell. ``parse`` then turns the JSON
    embedded in each result page into ``QcwyItem`` objects, one per job row,
    or a single placeholder item when the search returned nothing.
    """

    name = 'qcwyCrawler'
    start_urls = []  # URLs in start_urls are requested automatically by scrapy

    # Location of the input workbook and the (1-based, inclusive) row range
    # that is scanned for company names.
    EXCEL_PATH = 'D:/code/xxx.xlsx'
    SHEET_NAME = 'Sheet1'
    FIRST_ROW = 1
    LAST_ROW = 105

    # The search keyword is inserted between these two URL fragments.
    SEARCH_URL_PREFIX = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    SEARCH_URL_SUFFIX = (
        ',1,1.html?lang=c&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
        '&dibiaoid=0&line=&welfare='
    )

    # Number of leading characters dropped from the script text before it is
    # parsed as JSON.
    # NOTE(review): presumably the length of a JavaScript assignment prefix
    # in front of the JSON payload - confirm against a live page.
    JSON_PREFIX_LENGTH = 29

    def __init__(self, **kwargs):
        """Build ``start_urls`` from the company names in the workbook.

        Args:
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(**kwargs)
        workbook = load_workbook(self.EXCEL_PATH)
        sheet = workbook[self.SHEET_NAME]

        urls = []
        for row_index in range(self.FIRST_ROW, self.LAST_ROW + 1):
            company = sheet.cell(row_index, 1).value
            if company is None:
                continue
            # str() keeps numeric cells from breaking the concatenation.
            urls.append(
                self.SEARCH_URL_PREFIX + str(company) + self.SEARCH_URL_SUFFIX
            )

        # Assign a fresh per-instance list instead of appending to the
        # class-level one, which would be shared by every instance.
        self.start_urls = urls

    def parse(self, response):
        """Yield one item per job found on a search result page.

        Args:
            response: The scrapy response for one search URL.

        Yields:
            QcwyItem: One item per entry in ``engine_search_result``; when
            that list is empty, a single placeholder item carrying the
            searched keyword. Nothing is yielded when the expected script
            node is missing from the page.
        """
        script_text = response.xpath(
            '/html/body/script[2]/text()'
        ).extract_first()
        if not script_text:
            self.logger.warning(
                'No search-result script found in %s', response.url
            )
            return

        data = json.loads(script_text[self.JSON_PREFIX_LENGTH:])
        results = data['engine_search_result']
        if not results:
            yield self._empty_item(data['searched_condition'])
            return

        for row in results:
            yield self._build_item(row)

    @staticmethod
    def _build_item(row):
        """Convert one ``engine_search_result`` entry into a new item."""
        # A new item per row: reusing one instance would make every yielded
        # reference point at the same object and leak fields between rows.
        item = QcwyItem()
        item['company'] = row['company_name']
        item['job_name'] = row['job_name']
        item['salary'] = row['providesalary_text']
        item['welfare'] = row['jobwelf']
        item['workarea'] = row['workarea_text']

        attributes = row['attribute_text']
        if len(attributes) > 3:
            # Entries 1..3 are read as experience, education, head count.
            item['experience'] = attributes[1]
            item['education'] = attributes[2]
            item['num'] = attributes[3]
        elif len(attributes) == 3:
            # With three entries the education one is treated as absent;
            # '无要求' means "no requirement".
            item['experience'] = attributes[1]
            item['num'] = attributes[2]
            item['education'] = '无要求'
        else:
            # Too few entries to tell the fields apart: fill placeholders
            # ('无' means "none") so every item carries the same keys.
            item['experience'] = '无'
            item['education'] = '无'
            item['num'] = '无'
        return item

    @staticmethod
    def _empty_item(searched_condition):
        """Build the placeholder item for a search with no results."""
        item = QcwyItem()
        item['company'] = searched_condition
        item['job_name'] = '无'
        item['salary'] = '无'
        item['welfare'] = '无'
        item['workarea'] = '无'
        item['education'] = '无'
        item['num'] = '0'
        item['experience'] = '无'
        return item