本例使用C++17(见CMakeLists中的CMAKE_CXX_STANDARD设置),使用DuckDB内存数据库模式,结合nlohmann::json分析Boss直聘的招聘数据,没有做可视化。主要是为自己学习的方向做一个校正,了解现在市场上都需要哪些技术类型的人才,因为太卷了。
这个用Python也能做,使用自带的json解析库和pandas的value_counts就能做。但是使用duckdb也能很容易通过写SQL达到同样的效果。
数据的爬取过程需要到XX直聘网站进行爬取。
先用浏览器打开一个XX直聘网站的网页,然后使用Chrome/Edge Dev Tools工具抓取对应网站的拿取数据的URL。这里不贴出来了。
然后每次抓取30个job信息。使用json文件存下来。
放到指定目录,如下图。
注意名字里面最好带上对应的技术,到时候做过滤用。
然后我们就开始写代码了,本文的代码中还用到了Poco库来做HTTP请求,但是毫无疑问,XX直聘网站是不让用URL直接去爬接口的。所以没用上。
代码如下,
conanfile.txt
[requires]
boost/1.81.0
zlib/1.3.1
nlohmann_json/3.11.3
gumbo-parser/0.10.1
poco/1.13.3
duckdb/0.10.2
[generators]
cmake
[layout]
cmake_layout
CMakeLists.txt
cmake_minimum_required(VERSION 3.3)
project(54_boss_crawer)
set ( CMAKE_CXX_FLAGS "-pthread")
set(CMAKE_CXX_STANDARD 17)
add_definitions(-D _GLIBCXX_USE_CXX11_ABI=0)
add_definitions(-g)
# Conan 1.x "cmake" generator integration (matches the [generators] section
# of conanfile.txt): conanbuildinfo.cmake defines CONAN_LIBS & friends.
include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
conan_basic_setup()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
link_directories(${CONAN_LIB_DIRS_POCO})
# Build one executable per top-level *.cpp, each linked with the shared
# utility sources under src/utils/.
file( GLOB main_file_list ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file( GLOB sources ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/*.cc)
foreach( main_file ${main_file_list} )
    file(RELATIVE_PATH filename ${CMAKE_CURRENT_SOURCE_DIR} ${main_file})
    # Executable name = source file name minus the ".cpp" suffix.
    # Fixed: the original line had a garbled "$(unknown)" where ${filename}
    # belongs, which made the configure step fail.
    string(REPLACE ".cpp" "" file ${filename})
    add_executable(${file} ${main_file} ${sources})
    target_link_libraries(${file} ${CONAN_LIBS} pthread)
endforeach( main_file ${main_file_list})
主文件boss_craw.cpp
#include <iostream>
#include "utils/data_extract.h"
#include "utils/data_analysis.h"
int main(int argc, char* argv[]) {
if(argc != 3) {
std::cerr << "Usage: ./boss_craw {json_dir} {filter_name}" << std::endl;
std::cerr << "Example: ./boss_craw ./data qa" << std::endl;
return EXIT_FAILURE;
}
std::string json_dir = argv[1];
std::string filter_name = argv[2];
data_extract_op_t extract_op {json_dir, filter_name};
data_extract_t data_extract(extract_op);
auto job_data = data_extract.do_extract();
data_analysis_t data_analysis(job_data);
data_analysis.do_analysis();
std::cout << "Data analysis Done!" << std::endl;
return EXIT_SUCCESS;
}
include/beans/beans.h
#ifndef _FREDRIC_BEANS_H_
#define _FREDRIC_BEANS_H_
#include <string>
#include <vector>
// Options for the extraction step: the directory holding the crawled json
// files and the file-name keyword used to pick which of them to analyse.
struct data_extract_op_t {
std::string json_dir;
std::string filter_name;
};
// One job posting: its title plus the list of required skills.
struct job_data_t {
// Take both arguments by value and move them into the members.
// Fixed: the original declared job_name_ as `std::string const&` and then
// std::move'd it — moving from a const reference silently degrades to a copy.
job_data_t(std::string job_name_, std::vector<std::string> skills_):
job_name(std::move(job_name_)), skills(std::move(skills_)){}
std::string job_name;
std::vector<std::string> skills;
};
#endif
include/utils/data_analysis.h
#ifndef _FREDRIC_DATA_ANALYSIS_H_
#define _FREDRIC_DATA_ANALYSIS_H_
#include "duckdb.hpp"
#include "beans/beans.h"
// Loads extracted job records into an in-memory DuckDB database and exports
// skill-frequency statistics as CSV files.
struct data_analysis_t {
// Opens the in-memory database and inserts one row per job record.
data_analysis_t(std::vector<job_data_t> const& job_datas);
// Writes analysis.csv (per-skill counts, descending) and job.csv (raw rows).
void do_analysis();
private:
// Creates the `job` table and inserts the given records into it.
void convert_to_db_data(std::vector<job_data_t> const& job_datas);
duckdb::DuckDB db;        // in-memory database instance (constructed with nullptr path)
duckdb::Connection conn;  // connection bound to `db`; member order matters (db first)
};
#endif
src/utils/data_analysis.cc
#include "utils/data_analysis.h"
#include <sstream>
// Open an in-memory DuckDB instance (a nullptr path means a transient,
// non-persistent database), connect to it, and bulk-load the extracted job
// records into the `job` table.
data_analysis_t::data_analysis_t(std::vector<job_data_t> const& job_datas):
db(nullptr), conn(db) {
convert_to_db_data(job_datas);
}
// Runs the analysis queries and exports the results via DuckDB's COPY:
//  - analysis.csv: occurrence count per skill (the skills list column is
//    unnested first), ordered from most to least frequent
//  - job.csv: the raw `job` table as inserted
// NOTE(review): Query() results are not checked here — a failed COPY (e.g.
// unwritable working directory) passes silently; confirm that's acceptable.
void data_analysis_t::do_analysis() {
std::string skills_analysis_query = R"(copy(select skill, count(skill) as skill_count from
(select unnest(skills) as skill from job) group by skill order by skill_count desc)
to 'analysis.csv' (HEADER, DELIMITER ',');)";
conn.Query(skills_analysis_query);
std::string raw_result_query = R"(copy(select * from job)
to 'job.csv' (HEADER, DELIMITER ',');)";
conn.Query(raw_result_query);
}
// Creates the `job` table and inserts one row per extracted job posting.
// Values are embedded directly in the SQL text, so single quotes inside job
// names or skills must be escaped (doubled, per standard SQL) — otherwise a
// title like "C++/Java工程师's" breaks the statement (and is an injection
// vector if the json input is untrusted).
void data_analysis_t::convert_to_db_data(std::vector<job_data_t> const& job_datas) {
    // Double every single quote so the value survives a '...' SQL literal.
    auto escape = [](std::string const& raw) {
        std::string escaped;
        escaped.reserve(raw.size());
        for (char const ch : raw) {
            if (ch == '\'') {
                escaped += "''";
            } else {
                escaped += ch;
            }
        }
        return escaped;
    };
    conn.Query("CREATE TABLE job (job_name VARCHAR, skills VARCHAR[])");
    for (auto const& job_data : job_datas) {
        std::stringstream insert_job_data_ss;
        insert_job_data_ss << "INSERT INTO job VALUES('" << escape(job_data.job_name) << "', [";
        // Render the skills vector as a DuckDB list literal: ['a','b',...]
        // (an empty vector yields [] — also valid).
        std::string sep;
        for (auto const& skill : job_data.skills) {
            insert_job_data_ss << sep << "'" << escape(skill) << "'";
            sep = ",";
        }
        insert_job_data_ss << "]);";
        conn.Query(insert_job_data_ss.str());
    }
}
include/utils/data_extract.h
#ifndef _FREDRIC_DATA_EXTRACT_H_
#define _FREDRIC_DATA_EXTRACT_H_
#include "nlohmann/json.hpp"
#include "boost/filesystem.hpp"
#include "beans/beans.h"
#include <string>
#include <vector>
// ordered_json preserves the key order of the crawled payloads when parsing.
using ordered_json = nlohmann::ordered_json;
namespace fs = boost::filesystem;
// Reads the crawled Boss-Zhipin json files from a directory and flattens them
// into job_data_t records (job name + skill list).
struct data_extract_t {
data_extract_t(data_extract_op_t const& data_ex_op_);
// Parses every selected json file and returns the flattened job records.
std::vector<job_data_t> do_extract();
private:
// Lists the files in json_dir selected by filter_name.
std::vector<std::string> get_all_json_files();
data_extract_op_t data_ex_op;
};
#endif
src/utils/data_extract.cc
#include "utils/data_extract.h"
#include "utils/funcs.h"
// Stores the extraction options (json directory + file-name filter keyword).
data_extract_t::data_extract_t(data_extract_op_t const& data_ex_op_):
data_ex_op(data_ex_op_) {}
// Collects the files in json_dir whose *file name* contains filter_name and
// whose extension is ".json".
// Fixed: the original matched filter_name against the full path, so a
// directory name containing the keyword made every entry match, and non-json
// files (or subdirectories) could slip into the result.
// Note: throws boost::filesystem::filesystem_error if json_dir does not exist.
std::vector<std::string> data_extract_t::get_all_json_files() {
    std::vector<std::string> all_json_files;
    fs::directory_iterator dir_it(data_ex_op.json_dir);
    for (auto const& entry : dir_it) {
        auto const& path = entry.path();
        if (path.extension() != ".json") {
            continue;  // skip directories and non-json files
        }
        if (path.filename().string().find(data_ex_op.filter_name) != std::string::npos) {
            all_json_files.push_back(path.string());
        }
    }
    return all_json_files;
}
// Parses every selected json file and flattens the Boss-Zhipin payload
// (zpData.jobList[*].{jobName, skills}) into job_data_t records.
// Robustness fix: the original indexed the json blindly, so a malformed file
// or a payload missing a key threw (or silently created nulls) and aborted the
// whole run. Bad files/records are now skipped instead.
std::vector<job_data_t> data_extract_t::do_extract() {
    auto js_files = get_all_json_files();
    std::vector<job_data_t> result_jobs;
    for (auto const& js_file : js_files) {
        auto js_content = get_file_content(js_file);
        // allow_exceptions=false: parse errors yield a "discarded" value
        // instead of throwing.
        auto result_js = ordered_json::parse(js_content, nullptr, false);
        if (result_js.is_discarded() || !result_js.contains("zpData") ||
            !result_js["zpData"].contains("jobList")) {
            continue;  // unparseable file or unexpected payload shape
        }
        auto& job_list_js = result_js["zpData"]["jobList"];
        for (auto const& job_js : job_list_js) {
            if (!job_js.contains("jobName")) {
                continue;  // record without a title is useless for the analysis
            }
            auto job_name = job_js["jobName"].get<std::string>();
            std::vector<std::string> skills;
            if (job_js.contains("skills")) {
                for (auto const& skill_js : job_js["skills"]) {
                    skills.push_back(skill_js.get<std::string>());
                }
            }
            result_jobs.emplace_back(std::move(job_name), std::move(skills));
        }
    }
    return result_jobs;
}
include/utils/funcs.h
#ifndef _FREDRIC_FUNCS_H_
#define _FREDRIC_FUNCS_H_
#include <string>
// Reads the whole file at file_path and returns its contents as a string.
std::string get_file_content(std::string const& file_path);
// Writes content to file_path, creating/overwriting the file.
void save_to_file(std::string const& file_path, std::string const& content);
#endif
src/utils/funcs.cc
#include "utils/funcs.h"
#include <fstream>
#include <sstream>
#include <string>
// Reads the whole file at file_path into a string via the stream buffer.
// An unopenable file yields an empty string (no error is reported).
std::string get_file_content(std::string const& file_path) {
    std::ifstream input(file_path);
    std::ostringstream buffer;
    buffer << input.rdbuf();  // bulk-copy the whole stream; RAII closes the file
    return buffer.str();
}
// Writes content to file_path, truncating any previous contents, and flushes
// before the stream closes (RAII handles the close itself).
void save_to_file(std::string const& file_path, std::string const& content) {
    std::ofstream out(file_path, std::ios::out);
    out << content << std::flush;
}
include/utils/http_req.h
#ifndef _FREDRIC_HTTP_REQ_H_
#define _FREDRIC_HTTP_REQ_H_
#include <Poco/Net/HTTPClientSession.h>
#include <Poco/URI.h>
#include <Poco/Net/HTTPSClientSession.h>
#include <Poco/Net/HTTPRequest.h>
#include <Poco/Net/HTTPResponse.h>
#include <Poco/Net/StreamSocket.h>
#include <Poco/StreamCopier.h>
#include <iostream>
#include <sstream>
#include <map>
/***
* http_req_t is a https client, which is based on the Poco library,
* The usage is as belows
* std::string resp;
bool req_ok = http_req_t::send_http_request(req_type_t::get,
"https://www.zhipin.com:443/wapi/zpgeek/search/joblist.json?scene=1&query=%E6%B5%8B%E8%AF%95%E5%BC%80%E5%8F%91&city=101020100&experience=&payType=&partTime=°ree=&industry=&scale=&stage=&position=&jobType=&salary=&multiBusinessDistrict=&multiSubway=&page=2&pageSize=30",
{ {"Sec-Ch-Ua", R"("Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99")"},
{"Cookie", "wd_guid=eeeb3491-7581-4c30-a605-ed20b5850811; historyState=state; _bl_uid=IUlqev2X67d21dd247U3n5vabwsb; lastCity=101020100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1713506343,1715906605; __zp_stoken__=ae3bfw44XwrEFPVpQB1cbRnsEw4hRbkrCvXlWSlFrwrVTTMOLwpNyaMK%2BX1DCsmPCm11MwqzDiXnCrcOJwptZwr9Ewp5NwqXCpcO6wp3CmsSfw7VNxKbEiMKvwrHCkD84DAwCDwAYGA4bBA8PDhsEGBgOGwQYGA4bBDoiw7YBMD9ISyBCRkYZRFVUQVpAD1pORj4xBmgbBjE7MUs%2BPcOIMcK1wpbCtDHCv8KTw4s9wr%2FDoUs2PTHCv0UtKsKxw5sHwr8UDcK8w7ghwr3Dm1rCskkBw4FrccKAwqHCscOaLzwzwrHEvj4%2FH0s8MkkwPEkyPy88EMObaHfCg8Ktw4vDhSAxHDo%2FMDc6Mj8wNTwwIzA0Szg%2FPSFIARsDDBslSMKxVcKxw5Q%2FMA%3D%3D; __c=1715906605; __l=l=%2Fwww.zhipin.com%2Fshanghai%2F&r=&g=&s=3&friend_source=0&s=3&friend_source=0; __a=96540295.1713170242.1713496020.1715906605.29.4.18.29; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1715914788; __zp_sname__=daff8e85; __zp_sseed__=qB2wV9miNIcDSbJGjo/3e4VMIYW92CvxTyauZ8P5cuQ=; __zp_sts__=1715922595735"},
{"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"},
{"Cache-Control", "max-age=0"},
{"Connection", "keep-alive"},
{"Host", "www.zhipin.com"},
{"Accept-Encoding", "gzip, deflate, br, zstd"},
{"Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"},
{"Accept", "text/html"}
},
"",
resp
);
bool req_ok_bing = http_req_t::send_http_request(req_type_t::get,
"https://cn.bing.com:443/",
{
{"Accept", "text/html"},
{"Content-Encoding", "gzip"}
},
"",
resp
);
std::cout << resp << std::endl;
*/
// HTTP verb selector for http_req_t::send_http_request.
enum req_type_t {
get,
post
};
// Minimal Poco-based HTTPS client wrapper (see the usage example in the
// header comment above).
struct http_req_t {
// Sends an HTTPS request and captures the response body.
//   req_type - GET or POST
//   url      - full URL including scheme (and optionally an explicit port)
//   headers  - request headers, applied verbatim
//   contents - request body text (empty string for no body)
//   resp_out - on success, the response body; on failure, the HTTP reason
//              phrase or the Poco exception text
// Returns true only when the server answers 200 OK; false otherwise.
static bool send_http_request(req_type_t req_type,
std::string const& url,
std::map<std::string, std::string> const& headers,
std::string const& contents,
std::string& resp_out);
};
#endif
src/utils/http_req.cc
#include "utils/http_req.h"
// Sends an HTTPS request via Poco and stores the response body in resp_out.
// Returns true only for a 200 OK answer; on a non-200 status or an exception,
// resp_out carries the reason phrase / exception text and false is returned.
bool http_req_t::send_http_request(req_type_t req_type,
                                   std::string const& url,
                                   std::map<std::string, std::string> const& headers,
                                   std::string const& contents,
                                   std::string& resp_out) {
    try {
        Poco::URI uri(url);
        Poco::Net::HTTPSClientSession session(uri.getHost(), uri.getPort());
        auto http_method = Poco::Net::HTTPRequest::HTTP_GET;
        if (req_type == req_type_t::post) {
            http_method = Poco::Net::HTTPRequest::HTTP_POST;
        }
        std::cout << uri.getPathAndQuery() << std::endl;
        Poco::Net::HTTPRequest request(http_method, uri.getPathAndQuery(), Poco::Net::HTTPMessage::HTTP_1_1);
        // Apply caller-provided headers verbatim.
        for (auto const& header_pair : headers) {
            request.set(header_pair.first, header_pair.second);
        }
        // Fixed: frame the request body. Without an explicit Content-Length
        // the body written below has no length information, so servers may
        // ignore or reject the payload (this mainly affects POST).
        if (!contents.empty()) {
            request.setContentLength(static_cast<std::streamsize>(contents.size()));
        }
        std::ostream& ostr = session.sendRequest(request);
        ostr << contents;
        // Receive the response.
        Poco::Net::HTTPResponse response;
        std::istream& rs = session.receiveResponse(response);
        if (response.getStatus() != Poco::Net::HTTPResponse::HTTPStatus::HTTP_OK) {
            // Surface the non-200 status to the caller.
            std::cout << "Response status: " << response.getStatus() << " " << response.getReason() << std::endl;
            resp_out = response.getReason();
            return false;
        }
        // Read the whole response body.
        std::ostringstream oss;
        Poco::StreamCopier::copyStream(rs, oss);
        resp_out = oss.str();
        return true;
    } catch (Poco::Exception& ex) {
        std::cout << "Exception occurred: " << ex.displayText() << std::endl;
        resp_out = ex.displayText();
        return false;
    } catch (std::exception& ex) {
        // Fixed: non-Poco failures previously escaped this function uncaught.
        std::cout << "Exception occurred: " << ex.what() << std::endl;
        resp_out = ex.what();
        return false;
    }
}
程序输出效果如下,