C++11使用DuckDB+nlohmann::json分析XX直聘的招聘数据

本例使用C++11,使用DuckDB内存数据库模式,结合nlohmann::json分析Boss直聘的招聘数据,没有做可视化。主要是为自己学习的方向做一个校正,了解现在市场上都需要哪些技术类型的人才,因为太卷了。
这个用Python也能做,使用自带的json解析库和pandas的value_counts就能做。但是使用duckdb也能很容易通过写SQL达到同样的效果。
数据的爬取过程需要到XX直聘网站进行爬取。
先用浏览器打开一个XX直聘网站的网页,然后使用Chrome/Edge的DevTools工具抓取该网页获取数据时请求的接口URL。这里不贴出来了。
然后每次抓取30个job信息。使用json文件存下来。
放到指定目录,如下图。
注意名字里面最好带上对应的技术,到时候做过滤用。


image.png

然后我们就开始写代码了,本文的代码中还用到了Poco库来做HTTP请求,但是毫无疑问,XX直聘网站是不让用URL直接去爬接口的。所以没用上。
代码如下,
conanfile.txt

[requires]
boost/1.81.0
zlib/1.3.1
nlohmann_json/3.11.3
gumbo-parser/0.10.1
poco/1.13.3
duckdb/0.10.2


[generators]
cmake
[layout]
cmake_layout

CMakeLists.txt

# Build script for the example: every *.cpp file in the project root becomes
# its own executable, and each executable is compiled together with the
# shared helper sources found under src/utils.
# NOTE(review): CMAKE_CXX_STANDARD 17 below is presumably only honoured by
# newer CMake releases than 3.3 - confirm and raise the minimum if needed.
cmake_minimum_required(VERSION 3.3)


project(54_boss_crawer)

# NOTE(review): this assignment replaces CMAKE_CXX_FLAGS rather than appending
# to it, so flags supplied from outside this file are dropped.
set ( CMAKE_CXX_FLAGS "-pthread")
set(CMAKE_CXX_STANDARD 17)
# NOTE(review): _GLIBCXX_USE_CXX11_ABI=0 presumably has to match the ABI the
# Conan packages were built with - verify against the Conan profile.
add_definitions(-D _GLIBCXX_USE_CXX11_ABI=0)
# Passes -g to the compiler for every target.
add_definitions(-g)

# conanbuildinfo.cmake is expected in the build directory; conan_basic_setup()
# comes from that file and provides the CONAN_* variables used below.
include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
conan_basic_setup()


include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
link_directories(${CONAN_LIB_DIRS_POCO})

# Entry points (*.cpp in the root) and shared helper sources (*.cc).
file( GLOB main_file_list ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 
file( GLOB sources ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/*.cc)

# One executable per entry point, named after the file without its ".cpp".
foreach( main_file ${main_file_list} )
    file(RELATIVE_PATH filename ${CMAKE_CURRENT_SOURCE_DIR} ${main_file})
    string(REPLACE ".cpp" "" file ${filename})
    add_executable(${file}  ${main_file} ${sources})
    target_link_libraries(${file} ${CONAN_LIBS} pthread)
endforeach( main_file ${main_file_list})

主文件boss_craw.cpp

#include <cstdlib>
#include <exception>
#include <iostream>
#include <string>
#include "utils/data_extract.h"
#include "utils/data_analysis.h"

/// Entry point: reads the job JSON files selected by the command line,
/// loads them into the analysis database and exports the CSV reports.
///
/// @param argc Must be 3 (program name, JSON directory, filter name).
/// @param argv argv[1] is the directory holding the JSON files, argv[2] the
///             filter string used to select files.
/// @return EXIT_SUCCESS when the analysis finished, EXIT_FAILURE on a usage
///         error or when any step reported a failure through an exception.
int main(int argc, char* argv[]) {
    if(argc != 3) {
        std::cerr << "Usage: ./boss_craw {json_dir} {filter_name}" << std::endl;
        std::cerr << "Example: ./boss_craw ./data qa" << std::endl;
        return EXIT_FAILURE;
    }

    std::string json_dir = argv[1];
    std::string filter_name = argv[2];

    try {
        data_extract_op_t extract_op {json_dir, filter_name};
        data_extract_t data_extract(extract_op);
        auto job_data = data_extract.do_extract();

        data_analysis_t data_analysis(job_data);
        data_analysis.do_analysis();
    } catch(std::exception const& ex) {
        // Extraction and analysis signal failures with exceptions; report
        // them instead of letting the process terminate abnormally.
        std::cerr << "Data analysis failed: " << ex.what() << std::endl;
        return EXIT_FAILURE;
    }

    std::cout << "Data analysis Done!" << std::endl;
    return EXIT_SUCCESS;
}

include/beans/beans.h

#ifndef _FREDRIC_BEANS_H_
#define _FREDRIC_BEANS_H_

#include <string>
#include <vector>

/// Options that configure the JSON extraction step.
struct data_extract_op_t {
    /// Directory that is scanned for input files.
    std::string json_dir;
    /// Substring used to select which directory entries are read.
    std::string filter_name;
};

/// One job posting: its name and the skill tags attached to it.
struct job_data_t {
    /// Builds a record from a job name and its skills.
    ///
    /// Both arguments are taken by value and moved into the members, so
    /// callers passing temporaries pay no copy. (Calling std::move on a
    /// const reference, as before, silently degrades to a copy.)
    job_data_t(std::string job_name_, std::vector<std::string> skills_):
      job_name(std::move(job_name_)), skills(std::move(skills_)){}
    /// Name of the job posting.
    std::string job_name;
    /// Skill tags of the job posting.
    std::vector<std::string> skills;
};
#endif

include/utils/data_analysis.h

#ifndef _FREDRIC_DATA_ANALYSIS_H_
#define _FREDRIC_DATA_ANALYSIS_H_

#include "duckdb.hpp"
#include "beans/beans.h"

/// Loads job records into a DuckDB database and exports CSV reports from it.
struct data_analysis_t {
    /// Creates the `job` table and fills it with `job_datas`.
    data_analysis_t(std::vector<job_data_t> const& job_datas);
    /// Runs the export queries that write `analysis.csv` and `job.csv`.
    void do_analysis();
private:
    /// Creates the `job` table and inserts one row per element of `job_datas`.
    void convert_to_db_data(std::vector<job_data_t> const& job_datas);
    // `db` has to stay declared before `conn`: the constructor initializes
    // `conn` from `db`, and members are initialized in declaration order.
    duckdb::DuckDB db;
    duckdb::Connection conn;
};

#endif

src/utils/data_analysis.cc

#include "utils/data_analysis.h"
#include <sstream>

// Opens the database with a null path (the in-memory mode the article
// describes), connects to it and loads `job_datas` into the `job` table.
data_analysis_t::data_analysis_t(std::vector<job_data_t> const& job_datas):
    db(nullptr), conn(db) {
    convert_to_db_data(job_datas);
}

// Exports two CSV files from the `job` table into the working directory:
//   - analysis.csv: every skill with the number of jobs listing it, most
//     frequent first;
//   - job.csv: the raw contents of the `job` table.
// A failed export is no longer ignored: the error carried by the DuckDB
// result is rethrown (as a DuckDB exception) so the caller does not report
// success while the output files are missing.
void data_analysis_t::do_analysis() {
    std::string skills_analysis_query = R"(copy(select skill, count(skill) as skill_count from
        (select unnest(skills) as skill  from job) group by skill order by skill_count desc) 
        to 'analysis.csv' (HEADER, DELIMITER ',');)";
    auto skills_result = conn.Query(skills_analysis_query);
    if(skills_result->HasError()) {
        skills_result->ThrowError("Failed to export analysis.csv: ");
    }

    std::string raw_result_query = R"(copy(select * from job) 
        to 'job.csv' (HEADER, DELIMITER ',');)";
    auto raw_result = conn.Query(raw_result_query);
    if(raw_result->HasError()) {
        raw_result->ThrowError("Failed to export job.csv: ");
    }
}

namespace {

// Renders `text` as a single-quoted SQL string literal, doubling every
// embedded single quote so the value cannot terminate the literal early.
std::string to_sql_literal(std::string const& text) {
    std::string literal;
    literal.reserve(text.size() + 2);
    literal.push_back('\'');
    for(char const ch: text) {
        if(ch == '\'') {
            literal.push_back('\'');
        }
        literal.push_back(ch);
    }
    literal.push_back('\'');
    return literal;
}

}  // namespace

// Creates the `job` table and inserts one row per element of `job_datas`.
//
// Job names and skills come from scraped JSON, so they are escaped before
// being spliced into the INSERT statement; an unescaped quote would make the
// statement invalid. Skills are joined with explicit separators, so an empty
// skill list yields `[]` without relying on unsigned wrap-around in substr.
// Any statement DuckDB rejects is rethrown (as a DuckDB exception) instead of
// being silently dropped.
void data_analysis_t::convert_to_db_data(std::vector<job_data_t> const& job_datas) {
    auto create_result = conn.Query("CREATE TABLE job (job_name VARCHAR, skills VARCHAR[])");
    if(create_result->HasError()) {
        create_result->ThrowError("Failed to create table job: ");
    }

    for(auto const& job_data: job_datas) {
        std::stringstream insert_job_data_ss;
        insert_job_data_ss << "INSERT INTO job VALUES(" << to_sql_literal(job_data.job_name) << ", [";

        bool is_first_skill = true;
        for(auto const& skill: job_data.skills) {
            if(!is_first_skill) {
                insert_job_data_ss << ",";
            }
            insert_job_data_ss << to_sql_literal(skill);
            is_first_skill = false;
        }
        insert_job_data_ss << "]);";

        auto insert_result = conn.Query(insert_job_data_ss.str());
        if(insert_result->HasError()) {
            insert_result->ThrowError("Failed to insert job row: ");
        }
    }
}

include/utils/data_extract.h

#ifndef _FREDRIC_DATA_EXTRACT_H_
#define _FREDRIC_DATA_EXTRACT_H_

#include "nlohmann/json.hpp"
#include "boost/filesystem.hpp"
#include "beans/beans.h"
#include <string>
#include <vector>

using ordered_json = nlohmann::ordered_json;
namespace fs = boost::filesystem;


/// Reads job postings from JSON files in a directory.
struct data_extract_t {
    /// Stores a copy of the extraction options.
    data_extract_t(data_extract_op_t const& data_ex_op_);
    /// Parses every selected file and returns the jobs found in them.
    std::vector<job_data_t> do_extract();
private:
    /// Lists the paths of the directory entries selected by the filter.
    std::vector<std::string> get_all_json_files();
    /// Options given at construction (input directory and filter string).
    data_extract_op_t data_ex_op;
};
#endif

src/utils/data_extract.cc

#include "utils/data_extract.h"
#include "utils/funcs.h"

// Keeps a copy of the options; no file system access happens here.
data_extract_t::data_extract_t(data_extract_op_t const& data_ex_op_):
    data_ex_op(data_ex_op_) {}

// Returns the paths of all regular files directly inside `json_dir` whose
// file name contains `filter_name`.
//
// The filter is applied to the file name only. Matching the whole path, as
// before, selected every entry as soon as the directory part of the path
// happened to contain the filter text. Sub-directories and other non-regular
// entries are skipped because they cannot be read as JSON documents.
// NOTE(review): constructing the directory iterator presumably throws when
// `json_dir` cannot be opened - confirm against the Boost.Filesystem docs.
std::vector<std::string> data_extract_t::get_all_json_files() {
    std::vector<std::string> all_json_files;
    fs::directory_iterator dir_it(data_ex_op.json_dir);
    for(auto const& entry: dir_it) {
        if(!fs::is_regular_file(entry.status())) {
            continue;
        }
        auto const file_name = entry.path().filename().string();
        if(file_name.find(data_ex_op.filter_name) != std::string::npos) {
            all_json_files.push_back(entry.path().string());
        }
    }
    return all_json_files;
}

std::vector<job_data_t> data_extract_t::do_extract() {
    auto js_files = get_all_json_files();
    std::vector<job_data_t> result_jobs;

    for(auto const& js_file: js_files) {
        auto js_content = get_file_content(js_file);
        auto result_js = ordered_json::parse(js_content);
        auto& job_list_js = result_js["zpData"]["jobList"];
        for(auto const& job_js: job_list_js) {
            auto job_name = job_js["jobName"].get<std::string>();
            auto& skills_js = job_js["skills"];
            std::vector<std::string> skills;
            for(auto const& skill_js: skills_js) {
                skills.push_back(skill_js.get<std::string>());
            }
            result_jobs.emplace_back(job_name, skills);
        }
    }

    return result_jobs;
}

include/utils/funcs.h

#ifndef _FREDRIC_FUNCS_H_
#define _FREDRIC_FUNCS_H_

#include <string>

/// Returns the whole content of the file at `file_path` as a string.
std::string get_file_content(std::string const& file_path);

/// Writes `content` to the file at `file_path`.
void save_to_file(std::string const& file_path, std::string const& content);

#endif

src/utils/funcs.cc

#include "utils/funcs.h"
#include <fstream>
#include <sstream>
#include <string>

// Reads the file at `file_path` in one go and returns its content.
// A file that cannot be opened produces an empty string.
std::string get_file_content(std::string const& file_path) {
    std::ifstream input(file_path);
    std::ostringstream buffer;
    // Streaming the underlying buffer copies everything up to end of file.
    buffer << input.rdbuf();
    // `input` is closed by its destructor on return.
    return buffer.str();
}

// Stores `content` in the file at `file_path`, opening it for output only,
// which creates the file or replaces what it held before.
void save_to_file(std::string const& file_path, std::string const& content) {
    std::ofstream sink(file_path, std::ios::out);
    sink << content;
    // `sink` flushes and closes itself when it goes out of scope.
}

include/utils/http_req.h

#ifndef _FREDRIC_HTTP_REQ_H_
#define _FREDRIC_HTTP_REQ_H_

#include <Poco/Net/HTTPClientSession.h>
#include <Poco/URI.h>
#include <Poco/Net/HTTPSClientSession.h>  
#include <Poco/Net/HTTPRequest.h>
#include <Poco/Net/HTTPResponse.h>
#include <Poco/Net/StreamSocket.h>
#include <Poco/StreamCopier.h>
#include <iostream>
#include <sstream>
#include <map>

/***
 * http_req_t is an HTTPS client based on the Poco library.
 * Usage is as follows:
 * std::string resp;
    bool req_ok = http_req_t::send_http_request(req_type_t::get,
        "https://www.zhipin.com:443/wapi/zpgeek/search/joblist.json?scene=1&query=%E6%B5%8B%E8%AF%95%E5%BC%80%E5%8F%91&city=101020100&experience=&payType=&partTime=&degree=&industry=&scale=&stage=&position=&jobType=&salary=&multiBusinessDistrict=&multiSubway=&page=2&pageSize=30",
        {   {"Sec-Ch-Ua", R"("Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99")"},
            {"Cookie", "wd_guid=eeeb3491-7581-4c30-a605-ed20b5850811; historyState=state; _bl_uid=IUlqev2X67d21dd247U3n5vabwsb; lastCity=101020100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1713506343,1715906605; __zp_stoken__=ae3bfw44XwrEFPVpQB1cbRnsEw4hRbkrCvXlWSlFrwrVTTMOLwpNyaMK%2BX1DCsmPCm11MwqzDiXnCrcOJwptZwr9Ewp5NwqXCpcO6wp3CmsSfw7VNxKbEiMKvwrHCkD84DAwCDwAYGA4bBA8PDhsEGBgOGwQYGA4bBDoiw7YBMD9ISyBCRkYZRFVUQVpAD1pORj4xBmgbBjE7MUs%2BPcOIMcK1wpbCtDHCv8KTw4s9wr%2FDoUs2PTHCv0UtKsKxw5sHwr8UDcK8w7ghwr3Dm1rCskkBw4FrccKAwqHCscOaLzwzwrHEvj4%2FH0s8MkkwPEkyPy88EMObaHfCg8Ktw4vDhSAxHDo%2FMDc6Mj8wNTwwIzA0Szg%2FPSFIARsDDBslSMKxVcKxw5Q%2FMA%3D%3D; __c=1715906605; __l=l=%2Fwww.zhipin.com%2Fshanghai%2F&r=&g=&s=3&friend_source=0&s=3&friend_source=0; __a=96540295.1713170242.1713496020.1715906605.29.4.18.29; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1715914788; __zp_sname__=daff8e85; __zp_sseed__=qB2wV9miNIcDSbJGjo/3e4VMIYW92CvxTyauZ8P5cuQ=; __zp_sts__=1715922595735"},
            {"User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"},
            {"Cache-Control", "max-age=0"},
            {"Connection", "keep-alive"},
            {"Host", "www.zhipin.com"},
            {"Accept-Encoding", "gzip, deflate, br, zstd"},
            {"Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"},
            {"Accept", "text/html"}
        },
        "",
        resp
        );

     bool req_ok_bing = http_req_t::send_http_request(req_type_t::get,
        "https://cn.bing.com:443/",
        {
            {"Accept", "text/html"},
            {"Content-Encoding", "gzip"}
        },
        "",
        resp
        );
    std::cout << resp << std::endl;
*/

/// HTTP method selector for http_req_t::send_http_request.
enum req_type_t {
    get,   ///< Send the request with the GET method.
    post   ///< Send the request with the POST method.
};

/// Stateless helper exposing a single static request function.
struct http_req_t {
    /// Sends one request to `url` and captures the outcome in `resp_out`.
    ///
    /// @param req_type GET or POST.
    /// @param url      Absolute URL; host, port, path and query are taken from it.
    /// @param headers  Header name/value pairs set on the request.
    /// @param contents Text written to the request stream after the headers.
    /// @param resp_out Receives the response body on success, otherwise the
    ///                 reason phrase or the exception text.
    /// @return true only when the response status is 200 OK.
    static bool send_http_request(req_type_t req_type,
                                        std::string const& url,
                                        std::map<std::string, std::string> const& headers,
                                        std::string const& contents,
                                        std::string& resp_out);
};
#endif

src/utils/http_req.cc

#include "utils/http_req.h"

bool http_req_t::send_http_request(req_type_t req_type,
                                     std::string const& url,
                                     std::map<std::string, std::string> const& headers,
                                     std::string const& contents,
                                     std::string& resp_out) {
    try {
        Poco::URI uri(url); 
        Poco::Net::HTTPSClientSession session(uri.getHost(), uri.getPort());
        auto http_method = Poco::Net::HTTPRequest::HTTP_GET;
        if(req_type == req_type_t::post) {
            http_method = Poco::Net::HTTPRequest::HTTP_POST;
        }
        std::cout << uri.getPathAndQuery() << std::endl;
        Poco::Net::HTTPRequest request(http_method, uri.getPathAndQuery(), Poco::Net::HTTPMessage::HTTP_1_1);
        // 设置请求头
        for(auto const& header_pair: headers) {
            request.set(header_pair.first, header_pair.second);
        }

        std::ostream& ostr = session.sendRequest(request);  
        ostr << contents;

        // 接收响应  
        Poco::Net::HTTPResponse response;  
        std::istream& rs = session.receiveResponse(response);
        if(response.getStatus() != Poco::Net::HTTPResponse::HTTPStatus::HTTP_OK) {
            // 输出响应状态  
            std::cout << "Response status: " << response.getStatus() << " " << response.getReason() << std::endl; 
            resp_out = response.getReason();
            return false;
        }
        // 读取并输出响应体  
        std::ostringstream oss;  
        Poco::StreamCopier::copyStream(rs, oss);
        resp_out = oss.str();
        return true;
    } catch(Poco::Exception& ex) {
        std::cout << "Exception occurred: " << ex.displayText() << std::endl;
        resp_out = ex.displayText();
        return false;
    }
}

程序输出效果如下,


image.png

image.png
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 216,039评论 6 498
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 92,223评论 3 392
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 161,916评论 0 351
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,009评论 1 291
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,030评论 6 388
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,011评论 1 295
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,934评论 3 416
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,754评论 0 271
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,202评论 1 309
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,433评论 2 331
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,590评论 1 346
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,321评论 5 342
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,917评论 3 325
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,568评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,738评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,583评论 2 368
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,482评论 2 352

推荐阅读更多精彩内容