import re
import time
from urllib.parse import urlencode, urljoin

import requests
from lxml import etree
import pymongo
# Listing endpoint, pre-filtered to city code 310100 (Shanghai); get_page
# appends the 'k' (keyword) / 'p' (page) query string to this.
base_url = 'https://www.shixiseng.com/interns/c-310100_?'
# Desktop Chrome UA so the site serves the normal HTML pages.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
# Local MongoDB; scraped records are stored in the 'job' database
# (collection 'shixiseng', see save_to_mongo).
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['job']
def get_page(keyword, page):
    """Fetch one search-result listing page.

    Args:
        keyword: search term (e.g. a job title).
        page: 1-based result page number.

    Returns:
        The page HTML as a string, or None on a non-200 status or any
        network error.
    """
    data = {
        'k': keyword,
        'p': page
    }
    url = base_url + urlencode(data)
    try:
        # timeout prevents the whole crawl from hanging on one dead request
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP failures are
        # expected here; programming errors should surface, not be printed.
        print(e)
    return None
# The site renders digits with a custom obfuscation font; these private-use
# code points correspond to the real digits 0-9.
mapping = {
    '\uf78f': '0', '\uea9b': '1', '\ue801': '2', '\ue170': '3', '\uf648': '4',
    '\uef30': '5', '\ue0ca': '6', '\uf8cb': '7', '\uf2fb': '8', '\ue5c2': '9'
}


def decrypt_text(text):
    """Return *text* with every obfuscated glyph replaced by its digit."""
    # Single C-level pass instead of ten chained .replace() calls.
    return text.translate(str.maketrans(mapping))
def parse_page(html):
    """Extract every job-detail link from a listing page and crawl each one."""
    detail_links = etree.HTML(html).xpath(
        '//ul[@class="position-list"]//a[@class="name"]//@href')
    for link in detail_links:
        parse_detail(link)
        # be polite to the server between detail-page requests
        time.sleep(2)
def save_to_mongo(data):
    """Upsert one job record into the 'shixiseng' collection, keyed by 'id'."""
    # replace_one(..., upsert=True) is the modern equivalent of the
    # deprecated Collection.update(spec, doc, True), which was removed
    # in pymongo 4.x.
    db['shixiseng'].replace_one({'id': data['id']}, data, upsert=True)
def parse_detail(url):
    """Crawl a single job-detail page and store the parsed record in Mongo.

    Args:
        url: detail-page href scraped from the listing (site-relative path).
    """
    # urljoin resolves the relative href against the site root.  The old
    # base_url.rstrip('/interns/c-310100_?') only worked by accident:
    # str.rstrip strips a trailing *set of characters*, not a suffix.
    url_join = urljoin(base_url, url)
    print('当前解析url: ', url_join)
    html = requests.get(url_join, headers=headers, timeout=10).text
    tree = etree.HTML(html)
    job_name = tree.xpath('//div[@class="new_job_name"]//text()')[0]
    # Fields rendered with the obfuscation font ("cutom_font") must be
    # run through decrypt_text to recover the real digits.
    publish_time = tree.xpath('//span[@class="cutom_font"]//text()')[0]
    publish_time = decrypt_text(publish_time)
    job_money = tree.xpath('//span[contains(@class, "job_money")]//text()')[0]
    job_money = decrypt_text(job_money)
    job_position = tree.xpath('//span[@class="job_position"]/@title')[0]
    job_week = tree.xpath('//span[@class="job_week cutom_font"]//text()')[0]
    job_week = decrypt_text(job_week)
    job_time = tree.xpath('//span[@class="job_time cutom_font"]//text()')[0]
    job_time = decrypt_text(job_time)
    job_good = tree.xpath('//div[@class="job_good"]//text()')[0]
    job_detail = ' '.join(tree.xpath('//div[@class="job_part"]/div//text()'))
    company_name = tree.xpath('//div[@class="job_com_name cutom_font"]//text()')[0]
    company_detail = tree.xpath('//div[@class="con-job con-com_introduce"]/div[3]//text()')[0]
    company_persons = tree.xpath('//div[@class="job_detail job_detail_msg"]/span[2]//text()')[0]
    company_sector = tree.xpath('//div[@class="job_detail job_detail_msg"]/span[3]//text()')[0]
    company_position = tree.xpath('//span[@class="com_position"]//text()')[0]
    end_time = tree.xpath('//div[@class="con-job deadline"]//div[@class="job_detail cutom_font"]//text()')[0]
    end_time = decrypt_text(end_time)
    data = {
        'job_name': job_name,
        'publish_time': publish_time,
        'job_money': job_money,
        'job_position': job_position,
        'job_week': job_week,
        'job_time': job_time,
        'job_good': job_good,
        'job_detail': job_detail,
        'company_name': company_name,
        'company_detail': company_detail,
        'company_persons': company_persons,
        'company_sector': company_sector,
        'company_position': company_position,
        'end_time': end_time,
        # synthetic dedup key: same company + same publish time = same post
        'id': company_name + publish_time
    }
    save_to_mongo(data)
def main(keyword):
    """Crawl every result page for *keyword*: page 1 first, then paginate."""
    html = get_page(keyword, 1)
    if not html:
        return
    # 解析第一页
    parse_page(html)
    # 翻页: the active page button's @title reads '共N页' when paginated
    tree = etree.HTML(html)
    page_num = tree.xpath('//div[@id="pagebar"]//li[@class="active"]/a/@title')
    if len(page_num) > 0:  # 需要翻页
        condition = re.search('共(.*?)页', page_num[0])
        page_num = condition.group(1) if condition else 1
        for page in range(2, int(page_num) + 1):
            print('正在抓取第' + str(page) + '页...')
            html = get_page(keyword, page)
            # get_page returns None on failure; skip that page instead of
            # crashing in etree.HTML(None) inside parse_page.
            if html:
                parse_page(html)


if __name__ == '__main__':
    main(keyword='爬虫')
# --- NOTE(review): the lines below are page residue accidentally pasted in
# --- when this script was copied from a blog post; left as comments so the
# --- file stays valid Python. They are not part of the program.
# 实习僧爬虫
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 一:前言 继续练习Scrapy框架,这次抓取的是实习僧网最新的招聘信息,包括招聘岗位,时间,工资,学历要求,职位诱...
# - 《大学生求职类公众号》内容分析 超级offer寺:是一个正在孵化的大学生求职类公众号,随笔写写,通过分析大学生求职...