Work from last Friday:
- Used the elbow method to look for the best number of clusters for the top 10k queries, but on this data the slope of the SSE curve changes very little; the likely reason is that the data is too noisy to yield a clear optimal cluster count. Within the 1000-1500 range the SSE slope changes the most at 1370 clusters, so I ran a test with that value, but the job kept failing to get resources (a sketch of the elbow scan follows this list).
- Used the most basic dssm_v0 model to relate queries and titles; the code runs end to end and the training stage is finished, testing today.
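For reference, the elbow scan can be reproduced with a minimal Python sketch like the one below, assuming the 10k query vectors are already available as a NumPy array; the file path, k range, and step size are illustrative, not the actual job.

# Minimal elbow-method sketch; the .npy path is hypothetical, any (10000, dim) array works.
import numpy as np
from sklearn.cluster import KMeans

def elbow_scan(X, k_values):
    """Fit k-means for each k and record the SSE (sklearn's inertia_)."""
    return np.array([KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
                     for k in k_values])

X = np.load("top1w_query_vectors.npy")   # hypothetical input file
k_values = list(range(1000, 1501, 10))   # the 1000-1500 range scanned above
sse = elbow_scan(X, k_values)

# Pick the k where the slope of the SSE curve changes the most
# (largest second difference); this is how a value like 1370 falls out.
best_k = k_values[int(np.argmax(np.abs(np.diff(sse, n=2)))) + 1]
print("selected k:", best_k)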
Plan for today:
Run an inference test of the trained dssm_v0 with the top 10k queries and the ugc titles.
- Weighted embedding of the query
Get the word segmentation results and the weight of each word
create table graph_embedding.hs_dssm_dic_query_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0)a)b where lengthb(b.pair) > 0;
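To make the two-level split above easier to read, here is a rough Python illustration of what it is expected to produce per record; the output format of alinlp_termweight_ecom (pairs joined by "%", word and weight joined by "|") is inferred only from the separators used in the SQL and should be treated as an assumption.

# Plain-Python illustration (not the actual UDFs) of the split done by
# bi_udf:bi_split_value + graph_embedding:hs_split_1 above.
# Assumed input format: "word1|w1%word2|w2%..." per the separators in the SQL.
def split_termweight(record_id, tag_result):
    rows = []
    for pair in tag_result.split("%"):        # bi_split_value: one row per pair
        if not pair:
            continue
        word, weight = pair.split("|", 1)     # hs_split_1: pair -> (word, weight)
        rows.append((record_id, word, float(weight)))
    return rows

print(split_termweight(1, "连衣裙|0.62%夏季|0.21%女|0.17"))
# -> [(1, '连衣裙', 0.62), (1, '夏季', 0.21), (1, '女', 0.17)]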
Embed the segmented words
create table hs_dssm_dic_query_3 as select id, word, weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_2;
Take the weighted average of the word embeddings to get one vector per query
create table graph_embedding.hs_tmp_160 as select id, graph_embedding:hs_merge_emb_14(weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_3 group by id;
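The aggregation in hs_merge_emb_14 is assumed to be a weight-normalized average of the per-word embeddings for one id; the sketch below works under that assumption and also assumes word_emb is a delimited string of 100 floats, which is not confirmed by the SQL.

# Sketch of what hs_merge_emb_14 is assumed to compute:
# a weight-normalized average of the word embeddings belonging to one id.
import numpy as np

def merge_weighted_emb(weights, word_embs):
    """weights: list of floats; word_embs: list of delimited float strings (assumed format)."""
    vecs = np.array([[float(x) for x in emb.replace(",", " ").split()] for emb in word_embs])
    w = np.asarray(weights, dtype=float)
    w = w / w.sum() if w.sum() > 0 else np.full(len(w), 1.0 / len(w))
    return (w[:, None] * vecs).sum(axis=0)    # one vector per id

print(merge_weighted_emb([0.7, 0.3], ["0.1,0.2,0.3", "0.4,0.5,0.6"]))
# -> approximately [0.19 0.29 0.39]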
- Get the weighted-average embeddings for inference
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
hs_dssm_dic_query_inf_1 -> hs_tmp_162
create table graph_embedding.hs_dssm_dic_query_inf_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(query, "%", "{weight}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_inf_1 where lengthb(query) > 0)a)b where lengthb(b.pair) > 0;
create table hs_dssm_dic_query_inf_3 as select id, word, weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_inf_2;
create table graph_embedding.hs_tmp_162 as select id, graph_embedding:hs_merge_emb_14(weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_inf_3 group by id;
hs_dssm_dic_title_inf_1 -> hs_tmp_164
create table graph_embedding.hs_dssm_dic_title_inf_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_title_inf_1 where lengthb(title) > 0)a)b where lengthb(b.pair) > 0;
create table hs_dssm_dic_title_inf_3 as select id, word, weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_title_inf_2;
create table graph_embedding.hs_tmp_164 as select id, graph_embedding:hs_merge_emb_14(weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_3 group by id;
Get the inference data
create table graph_embedding.hs_tmp_156 as
select c.query_id, c.title_id, c.query, d.emb as title from
(select a.*, b.emb as query from
(select * from graph_embedding.hs_tmp_157)a left join (select * from graph_embedding.hs_tmp_162)b on a.query_id == b.id)c
left join (select * from graph_embedding.hs_tmp_164)d on c.title_id == d.id;
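As a quick, purely illustrative sanity check before running the dssm_v0 inference job, the two embedding columns of hs_tmp_156 could be parsed and scored with cosine similarity; the string delimiter and the scoring step are assumptions here, not part of the pipeline above.

# Hypothetical sanity check on hs_tmp_156: parse the query/title embedding
# strings and score each pair with cosine similarity.
import numpy as np

def parse_emb(s):
    # Assumes the emb column is a delimited string of floats.
    return np.array([float(x) for x in s.replace(",", " ").split()])

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

def score_pair(query_emb_str, title_emb_str):
    return cosine(parse_emb(query_emb_str), parse_emb(title_emb_str))

print(score_pair("0.1,0.2,0.3", "0.1,0.2,0.3"))   # -> ~1.0 for identical vectors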