找工作啊找工作
年关将至,相信很多小伙伴都在寻思着发展方向。不知有没有小伙伴跟笔者一样,找工作时狂刷招聘信息,为的是找出离家近一点的公司。但是很多公司的简介上没有地址,所以只能再点进去看公司的详细地址。
因此,写了个爬虫,方便找工作。
项目简介
主要代码有两个文件:job_spider.py(用于爬取数据)和 jobs_data_analyse.py(用于工作数据分析)。
笔者先获取工作列表,取得简介后再取得详情。下载完成后,再进行分析。
详细代码如下
job_spider.py
from bs4 import BeautifulSoup
import requests
import os
from enum import Enum
from program import config
import pandas as pd
pd.set_option('expand_frame_repr', False) # do not wrap the printed frame when it has many columns
class WEBTYPE(Enum):
    """Job sites the spider knows about.

    ``all`` is declared last; ``fetch_data`` relies on that order when it
    takes every member except the final one to get the concrete sites.
    """

    _51job = '_51job'  # 51job (search.51job.com)
    zhilian = 'zhilian'  # Zhilian (sou.zhaopin.com)
    all = 3  # every site listed above
# Global counter: how many job detail pages the spider has requested so far
SPIDER_REQUIRE_COUNT = 0
def get_51job_area_code():
    """Build the 51job "area name -> area code" table and cache it on disk.

    Requests the 51job search page for every two-digit area code from 01 to
    36, reads the area name from the ``work_position_input`` element of each
    page, and writes the resulting dict (in its ``str()`` form) to
    ``51job_area_code.txt`` inside ``config.job_data_dir``.
    """
    url_template = (
        'http://search.51job.com/list/{}0000,000000,0000,00,9,99,ios,2,1.html'
        '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
        # The pasted URL read "cotype=99°reefrom": "&deg" had been decoded as
        # an HTML entity. The query parameter is spelled out again here.
        '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
        '&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line='
        '&specialarea=00&from=&welfare='
    )

    area_codes = {}
    for code in range(1, 37):
        url = url_template.format('%02d' % code)
        page = requests.get(url, headers=config.http_headers).content.decode('gbk')
        field = BeautifulSoup(page, 'lxml').find(id="work_position_input")
        if field is None or not field.get('value'):
            # No area name on this page: skip the code instead of crashing
            # and losing every code collected so far.
            print('51job地区编号{}未找到地区名称,已跳过'.format(code))
            continue
        area_name = field['value']
        print(area_name, code)
        area_codes[area_name] = code

    # The data directory may not exist yet on a first run.
    os.makedirs(config.job_data_dir, exist_ok=True)
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(str(area_codes))
    # Report success only once the file has actually been written.
    print('51job地区编号文件获取成功')
def check_area_name():
    """Make sure the 51job area-code table exists locally, fetching it if not.

    The table lives in ``51job_area_code.txt`` under ``config.job_data_dir``.
    When the file is missing, or its content is not a dict literal, the table
    is downloaded again with ``get_51job_area_code()``.
    """
    import ast  # local import: only needed to validate the cached table

    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            cached = f.read()
        try:
            # literal_eval accepts only Python literals, whereas eval() would
            # execute whatever the file happens to contain.
            if isinstance(ast.literal_eval(cached), dict):
                return
        except (SyntaxError, ValueError, TypeError):
            # Unreadable cache: fall through and download a fresh table.
            pass
    print('51job缺少地区编号文件,获取中')
    get_51job_area_code()


# Run the check as soon as the module is imported.
check_area_name()
def fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], page_count=5, area='深圳'):
    """Crawl job data from scratch for one site or for all of them.

    Any jobs file left over from a previous run is deleted first. With
    ``WEBTYPE.all`` every member of ``WEBTYPE`` except the last one is
    crawled in turn; with a single site only that site is crawled. Any other
    value crawls nothing.
    """
    previous_data = config.jobs_data_path
    if os.path.exists(previous_data):
        os.remove(previous_data)
        print('删除之前爬的数据')

    if web_type == WEBTYPE.all:
        sites = list(WEBTYPE)[0: -1]
    elif web_type in (WEBTYPE._51job, WEBTYPE.zhilian):
        sites = [web_type]
    else:
        sites = []

    for site in sites:
        _fetch_data(site, keywords, page_count, area)
def _fetch_data(web_type, keywords, page_count, area):
    """Crawl one site and merge the result into the CSV files on disk.

    The job summaries are fetched first, then each job's detail page. The
    new rows are combined with whatever is already stored in
    ``config.jobs_data_path``, sorted by area and written back. A second
    file, ``config.jobs_data_introduce_path``, receives the same rows without
    the requirements column and with padded text columns for easier reading.
    """
    df = fetch_job_introduce(web_type, keywords, page_count, area)
    df = fetch_job_detail(df)
    df.fillna(value='', inplace=True)

    if os.path.exists(config.jobs_data_path):
        df_existed = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
        # DataFrame.append was removed in pandas 2.0. concat keeps the same
        # row order: new rows first, then the rows already on disk.
        df = pd.concat([df, df_existed], ignore_index=True)

    df.sort_values(by=['地区'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.to_csv(config.jobs_data_path, mode='w', encoding='utf-8')

    # Drop the long requirements text so the summary file is easy to browse.
    df_no_require = df.drop(['要求'], axis=1)
    column_widths = (('薪酬', 12), ('地区', 12), ('详细地址', 30), ('链接', 60))
    for column, width in column_widths:
        df_no_require[column] = df_no_require[column].apply(
            _make_introduce_beautiful, min_length=width)
    df_no_require.to_csv(config.jobs_data_introduce_path, mode='w', encoding='utf-8')
def _make_introduce_beautiful(txt, min_length):
    """Left-justify *txt* in a field of at least *min_length* characters.

    Args:
        txt: the cell value to pad. Values that are not strings (for example
            the NaN that pandas reads for an empty CSV cell) are treated as
            an empty string.
        min_length: minimum width of the returned string.

    Returns:
        *txt* padded on the right with spaces; text that is already longer
        than *min_length* is returned unchanged.
    """
    # Test the type explicitly instead of catching every exception: the
    # original printed one error message per empty cell.
    if not isinstance(txt, str):
        return ''.ljust(min_length)
    return txt.ljust(min_length)
def fetch_job_introduce(web_type, keywords, page_count, area):
    """Fetch the job summaries (search-result rows) for one site.

    Args:
        web_type: ``WEBTYPE._51job`` or ``WEBTYPE.zhilian``.
        keywords: search keywords; a single string is treated as one keyword.
        page_count: number of result pages to request, starting at page 1.
        area: area name. For 51job it is translated to a two-digit code with
            the cached ``51job_area_code.txt`` table.

    Returns:
        The DataFrame produced by ``fetch_companies`` for the result pages.

    Raises:
        ValueError: if *web_type* is not one of the two supported sites.
        KeyError: if *area* is missing from the 51job area-code table.
    """
    if isinstance(keywords, str):
        # A bare string would otherwise be joined character by character.
        keywords = [keywords]

    if web_type == WEBTYPE._51job:
        import ast  # local import: only needed to parse the cached table

        # Placeholders: area code, keywords, page number.
        url = (
            "http://search.51job.com/list/{}0000,000000"
            ",0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&"
            # The pasted URL read "cotype=99°reefrom" ("&deg" decoded as an
            # HTML entity) and had a stray space after ".html?".
            "cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0"
            "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        )
        decode_type = 'gbk'
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        with open(file_path, mode='r', encoding='utf-8') as f:
            # The file holds the str() of a dict; literal_eval parses it
            # without executing arbitrary code the way eval() would.
            area_codes = ast.literal_eval(f.read())
        try:
            area_need = '%02d' % area_codes[area]
        except KeyError:
            raise KeyError('area {!r} is not in the 51job area-code table'.format(area)) from None
    elif web_type == WEBTYPE.zhilian:
        # Placeholders: area name, keywords, page number.
        url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&isadv=0&sg=7e9e61449fd14593a5604fff81aec46a&p={}"
        decode_type = "utf-8"
        # The original left this empty, so the requested area was never sent
        # to Zhilian. NOTE(review): assumes "jl" takes the area name - confirm.
        area_need = area
    else:
        raise ValueError('unsupported web_type: {!r}'.format(web_type))

    # Result pages are numbered from 1.
    urls = [url.format(area_need, ' '.join(keywords), page)
            for page in range(1, page_count + 1)]
    return fetch_companies(urls, decode_type, web_type)
def _parse_51job_row(row):
    """Read job title, company, area, salary and link from one 51job row.

    NOTE(review): the original read the anchor title into ``job_name`` and
    the ``t2`` span into ``company_name`` but stored them under the opposite
    columns. Each value is now stored under the column matching its name -
    confirm against the live markup.
    """
    anchor = row.find('a')
    return {
        '工作': anchor['title'],
        '公司': row.find('span', class_='t2').text,
        '地区': row.find('span', class_='t3').text,
        '薪酬': row.find('span', class_='t4').text,
        '链接': anchor['href'],
    }


def _parse_zhilian_row(row):
    """Read job title, company, area, salary and link from one Zhilian row.

    The column each cell ends up in is the same as in the original code;
    only the misleading local variable names are gone. Presumably "zwmc" and
    "gsmc" abbreviate the job-title and company-name cells - TODO confirm.
    """
    title_link = row.find("td", class_="zwmc").find("div").find("a")
    return {
        '工作': title_link.text,
        '公司': row.find("td", class_='gsmc').find("a").text,
        '地区': row.find("td", class_="gzdd").text,
        '薪酬': row.find("td", class_="zwyx").text,
        '链接': title_link["href"],
    }


def _list_result_rows(soup, web_type):
    """Return the result-row tags of a search page, or None if the list is absent."""
    if web_type == WEBTYPE._51job:
        container = soup.find("div", class_="dw_table")
        return None if container is None else container.find_all("div", class_="el")
    container = soup.find(id="newlist_list_content_table")
    return None if container is None else container.find_all("table", class_="newlist")


def fetch_companies(urls, decode_type, web_type):
    """Download the search-result pages and collect one row per job.

    Args:
        urls: result-page URLs to request.
        decode_type: text encoding of the pages.
        web_type: ``WEBTYPE._51job`` or ``WEBTYPE.zhilian``; selects the
            parser. Any other value yields an empty table.

    Returns:
        A DataFrame with the columns 薪酬, 地区, 详细地址, 链接, 工作, 公司,
        来源 and 要求. 详细地址 and 要求 are left empty here.
    """
    columns = ['薪酬', '地区', '详细地址', '链接', '工作', '公司', '来源', '要求']
    if web_type == WEBTYPE._51job:
        parse_row = _parse_51job_row
    elif web_type == WEBTYPE.zhilian:
        parse_row = _parse_zhilian_row
    else:
        return pd.DataFrame(columns=columns)

    records = []
    for url in urls:
        page = requests.get(url, headers=config.http_headers).content.decode(decode_type)
        entries = _list_result_rows(BeautifulSoup(page, 'lxml'), web_type)
        if entries is None:
            # No result list on this page: report it and move on to the next
            # page instead of crashing on None.
            print(url, "简介解析错误")
            continue
        for entry in entries:
            try:
                record = parse_row(entry)
            except (AttributeError, KeyError, TypeError) as e:
                # Rows without job data (the original notes that the first
                # tag carries none) lack the expected tags; skip them.
                print(e, "简介解析错误")
                continue
            record.update({'详细地址': '', '来源': web_type.value, '要求': ''})
            records.append(record)

    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=columns)
def fetch_job_detail(df):
    """Fill in the detailed address and the requirements of every job row.

    Rows are addressed by the labels 0..n-1. Each row is passed to
    ``_fetch_location_and_require_from_detail`` and the returned pair is
    written into the 详细地址 and 要求 columns. The same frame is returned.
    """
    row_count = df.shape[0]
    for row in range(row_count):
        detail = _fetch_location_and_require_from_detail(df.loc[row])
        df.loc[row, '详细地址'], df.loc[row, '要求'] = detail
    return df
def _fetch_location_and_require_from_detail(introduce):
    """Download one job's detail page and extract its address and requirements.

    Args:
        introduce: one row of the job table. The 来源, 链接 and 公司 fields
            are read here; the row is also passed on to
            ``_fetch_location_from_detail``.

    Returns:
        A ``(location, require)`` pair of strings. ``("", "")`` is returned
        when the source is unknown or the page cannot be downloaded or
        parsed.
    """
    global SPIDER_REQUIRE_COUNT
    web_type = introduce['来源']
    href = introduce['链接']
    company_name = introduce['公司']

    if web_type == WEBTYPE._51job.value:
        encoding = "gbk"
        container_class = "bmsg job_msg inbox"
    elif web_type == WEBTYPE.zhilian.value:
        encoding = "utf-8"
        container_class = "tab-inner-cont"
    else:
        # The original fell off the end and returned None here, which broke
        # the caller's tuple unpacking.
        return "", ""

    SPIDER_REQUIRE_COUNT += 1
    print("正在爬第{}个公司{}的要求\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
    try:
        page = requests.get(href, headers=config.http_headers).content.decode(encoding)
        location_detail = _fetch_location_from_detail(page, introduce)
        container = BeautifulSoup(page, 'lxml').find('div', class_=container_class)
        if container is None:
            # Keep the address even though the requirements block is missing.
            print("工作要求解析错误")
            return location_detail, ""

        # Tags inside the container whose text is not part of the requirements.
        if web_type == WEBTYPE._51job.value:
            noise_tags = [container.find('p', class_='fp'),
                          container.find('div', class_='share')]
        else:
            noise_tags = [container.find('b'),
                          container.find('h2'),
                          container.find(id='applyVacButton1')]

        require = container.text
        for tag in noise_tags:
            # A missing tag used to raise and discard the whole result.
            if tag is not None and tag.text:
                require = require.replace(tag.text, '')
        for whitespace in ("\t", "\n", "\r"):
            require = require.replace(whitespace, "")
        return location_detail, require
    except Exception as e:
        # Best-effort scraping: one failing page (network error, bad
        # encoding, changed markup) must not abort the whole crawl.
        print(e, "工作要求解析错误")
        return "", ""
def _fetch_location_from_detail(h5_content, introduce):
    """Extract the company's detailed address from a job detail page.

    Args:
        h5_content: HTML text of the detail page.
        introduce: the job row; 来源 selects the parser and 地区 is the
            fallback value.

    Returns:
        The detailed address, or ``introduce['地区']`` when the page holds no
        address or the source is unknown. The original returned None in those
        cases.
    """
    web_type = introduce['来源']
    soup = BeautifulSoup(h5_content, 'lxml')

    if web_type == WEBTYPE._51job.value:
        for paragraph in soup.find_all('p', class_="fp"):
            text = paragraph.text
            if "上班地址" in text:
                return text.replace("上班地址:", "").replace("\t", "").replace("\n", "")
    elif web_type == WEBTYPE.zhilian.value:
        container = soup.find('div', class_="tab-inner-cont")
        heading = container.find("h2") if container is not None else None
        if heading is not None:
            location = heading.text
            # Same removals, in the same order, as the original chain.
            for junk in ("\t", "\n", "\r", " ", "查看职位地图"):
                location = location.replace(junk, "")
            return location
        print('上班地址解析错误')

    return introduce['地区']
jobs_data_analyse.py
import os
from program import config
import pandas as pd
import math
import jieba
import jieba.posseg
import csv
import matplotlib.pyplot as plt
from program.job_spider import *
import numpy as np
from PIL import Image
from collections import Counter
from wordcloud import WordCloud
pd.set_option('expand_frame_repr', False)  # same display option as set in job_spider.py
def jobs_data_analyse():
    """Load the crawled jobs CSV and run the salary and requirement analyses."""
    # The spider writes the file with its row index as the first column;
    # index_col=0 reads it back as the index instead of leaving a stray
    # "Unnamed: 0" column in the frame.
    df = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
    df['薪酬'] = df['薪酬'].apply(unify_salary_form)
    salary_analyse(df)
    require_analyse(df)
def unify_salary_form(salary):
    """Normalize a salary text to monthly amounts.

    Understood forms are a range such as ``"1-1.5万/月"``, ``"10-20万/年"``,
    ``"6-8千/月"`` or ``"8001-10000"``, and a single value such as
    ``"1.5万/月"``, which is used for both bounds. A trailing ``/年`` divides
    the amounts by 12; ``千`` and ``万`` multiply them by 1000 and 10000.

    Args:
        salary: the salary text. Anything that is not a string (NaN from an
            empty CSV cell, None) counts as missing.

    Returns:
        ``(low, high, "low-high")`` as strings, or None when the salary is
        missing or cannot be parsed.
    """
    if not isinstance(salary, str):
        return None

    text = salary.strip()
    months = 1
    if text.endswith('/年'):
        months = 12
        text = text.replace('/年', '')
    elif text.endswith('/月'):
        text = text.replace('/月', '')

    multiple = 1
    if text.endswith('千'):
        multiple = 1000
        text = text.replace('千', '')
    elif text.endswith('万'):
        multiple = 10000
        text = text.replace('万', '')

    bounds = text.split('-')
    low_text = bounds[0]
    # A single value has no upper bound of its own; reuse it.
    high_text = bounds[1] if len(bounds) > 1 else bounds[0]
    try:
        low = int(float(low_text) * multiple / months)
        high = int(float(high_text) * multiple / months)
    except (ValueError, OverflowError) as e:
        # Text that is not numeric cannot be converted.
        print(e)
        return None
    return str(low), str(high), str(low) + '-' + str(high)
def salary_analyse(df):
    """Print salary statistics and add numeric salary bounds to *df*.

    Adds the columns ``low_薪酬`` and ``high_薪酬``, taken from the first two
    items of each 薪酬 value, with NaN where the salary is missing. It then
    prints the overall average, the highest- and lowest-paying rows, and the
    average per area. Nothing beyond the two columns is computed when no row
    has a usable salary.

    Args:
        df: job table whose 薪酬 column holds ``(low, high, text)`` tuples or
            missing values, plus the 公司, 链接 and 地区 columns.
    """
    def bound(position):
        # NaN rather than None keeps the new columns numeric, and the type
        # test also copes with NaN cells, which "== None" did not.
        return lambda s: int(s[position]) if isinstance(s, (tuple, list)) else float('nan')

    df['low_薪酬'] = df['薪酬'].apply(bound(0))
    df['high_薪酬'] = df['薪酬'].apply(bound(1))

    priced = df.dropna(subset=['low_薪酬', 'high_薪酬'])
    if priced.empty:
        # idxmax/idxmin and the "%d" formatting below need at least one value.
        print('没有可用的薪酬数据')
        return

    print('该行业平均工资为: ', priced[['low_薪酬', 'high_薪酬']].mean().mean())
    index_max_salary = priced['high_薪酬'].idxmax()
    index_min_salary = priced['low_薪酬'].idxmin()
    # "\n" replaces the original "\\n", which printed a literal backslash-n.
    print('最高薪酬的公司: %s 薪酬为: %d 链接如下\n%s' % (
        df.loc[index_max_salary, '公司'], priced['high_薪酬'].max(), df.loc[index_max_salary, '链接']))
    print('最低薪酬的公司: %s 薪酬为: %d 链接如下\n%s' % (
        df.loc[index_min_salary, '公司'], priced['low_薪酬'].min(), df.loc[index_min_salary, '链接']))

    for area, group in priced.groupby('地区'):
        average_salary = group[['low_薪酬', 'high_薪酬']].mean().mean()
        print('该行业在地区:(%s)的平均薪酬为:%d' % (area, average_salary))
def require_analyse(df):
    """Analyse the job requirements: word frequencies, then a word cloud.

    The text cells of the 要求 column are concatenated and handed to
    ``_require_word_freq``; ``_require_word_cloud`` then runs.
    """
    # Keep only real text. Missing cells can be float NaN, numpy NaN or None,
    # and the original "type(x) == float" test caught only the first of them.
    texts = [require for require in df['要求'] if isinstance(require, str)]
    # join() builds the text in one pass instead of repeated "+=" copies.
    all_require = ''.join(texts)
    _require_word_freq(all_require)
    _require_word_cloud()
def _require_word_freq(all_require):
    """Count word frequencies in the requirements text and save them as CSV.

    The text is segmented with jieba using the user dictionary
    ``user_dict.txt``. Words listed in ``stopwords.txt`` and tokens flagged
    ``'x'`` are skipped. The (word, count) rows are written to
    ``config.jobs_require_word_freq_path``, most frequent first.

    Args:
        all_require: the concatenated requirements text.
    """
    jieba.load_userdict(os.path.join(config.jieba_dir, "user_dict.txt"))

    stopwords_path = os.path.join(config.jieba_dir, "stopwords.txt")
    # "with" closes the file (the original leaked the handle); a set makes
    # the per-token membership test constant time.
    with open(stopwords_path, "r", encoding="utf-8") as f:
        stopwords = {line.strip() for line in f}

    counter = Counter(
        seg.word
        for seg in jieba.posseg.cut(all_require)
        if seg.word not in stopwords and seg.flag != 'x'  # 'x': symbols
    )

    # newline='' is what the csv module asks for; without it an extra blank
    # line follows every row on Windows.
    with open(config.jobs_require_word_freq_path, "w", encoding="utf-8", newline='') as f:
        csv.writer(f).writerows(counter.most_common())
    print('词频文件保存成功,地址为:', config.jobs_require_word_freq_path)
def _require_word_cloud():
    """Draw a word cloud from the saved word-frequency CSV.

    Reads (word, count) rows from ``config.jobs_require_word_freq_path``,
    builds the cloud, saves it to ``config.wordcloud_png_path`` and shows it
    with matplotlib. Nothing is drawn when the file holds no rows.
    """
    word_freq_dic = {}
    with open(config.jobs_require_word_freq_path, mode='r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if len(row) < 2:
                # Blank lines (e.g. from a file written without newline='')
                # come back as empty rows.
                continue
            word_freq_dic[row[0]] = int(row[1])

    if not word_freq_dic:
        # NOTE(review): WordCloud is expected to reject an empty table - confirm.
        print('词频为空,无法生成词云')
        return

    # To shape the cloud with a picture, a mask can be passed instead, e.g.
    #   alice_coloring = np.array(Image.open(config.alice_png))
    #   WordCloud(font_path=config.wc_font_path, background_color='white',
    #             mask=alice_coloring, max_words=150, max_font_size=100,
    #             min_font_size=20)
    # (picture source: http://blog.csdn.net/fontthrone/article/details/72775865)
    wc = WordCloud(font_path=config.wc_font_path,
                   max_words=150, height=800, width=1400).generate_from_frequencies(word_freq_dic)

    # Save first: plt.show() may block until the window is closed, and the
    # image should exist even if the program is stopped at that point.
    wc.to_file(config.wordcloud_png_path)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()
def start():
    """Run the whole pipeline: area-code check, crawl, then analysis."""
    check_area_name()
    fetch_data(
        web_type=WEBTYPE.all,
        keywords=['iOS'],
        page_count=5,
        area='深圳',
    )
    jobs_data_analyse()


# The pipeline runs as soon as this file is executed.
start()
使用方法
打开项目文件jobs_data_analyse.py直接运行即可,可根据个人需求更改start()中传给fetch_data的参数(网站、关键词、地区、页数)。
运行后,就会开始收集数据。
收集完成后,会对收集来的薪酬数据简要分析。
最后会根据工作要求生成wordcloud。
为了方便按地区查看工作,笔者把工作简介放在jobs_data_introduce.csv,客官搜索自己要的地区进行查看。
这个demo只是符合笔者需要,仅供参考。