Work from last Friday:
- Used the elbow method to look for the best number of clusters for the top 10k queries, but on this data the slope of the SSE curve changes very little; the likely reason is that the data is too noisy to yield a clear optimal cluster count. Within the 1000-1500 range the SSE slope changes the most at 1370 clusters, so I ran a test with that value, but the job kept failing to get resources (a sketch of the elbow scan follows this list).
- Used the most basic dssm_v0 model to relate queries and titles; the code runs end to end and the training stage is finished, testing today.
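For reference, the elbow scan can be reproduced with a minimal Python sketch like the one below, assuming the 10k query vectors are already available as a NumPy array; the file path, k range, and step size are illustrative, not the actual job.

# Minimal elbow-method sketch; the .npy path is hypothetical, any (10000, dim) array works.
import numpy as np
from sklearn.cluster import KMeans

def elbow_scan(X, k_values):
    """Fit k-means for each k and record the SSE (sklearn's inertia_)."""
    return np.array([KMeans(n_clusters=k, n_init=10, random_state=0).fit(X).inertia_
                     for k in k_values])

X = np.load("top1w_query_vectors.npy")   # hypothetical input file
k_values = list(range(1000, 1501, 10))   # the 1000-1500 range scanned above
sse = elbow_scan(X, k_values)

# Pick the k where the slope of the SSE curve changes the most
# (largest second difference); this is how a value like 1370 falls out.
best_k = k_values[int(np.argmax(np.abs(np.diff(sse, n=2)))) + 1]
print("selected k:", best_k)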
Plan for today:
Run an inference test of the trained dssm_v0 with the top 10k queries and the ugc titles.
- Weighted embedding of the query
Get the word segmentation results and the weight of each word
create table graph_embedding.hs_dssm_dic_query_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(se_keyword, "%", "{weight}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_1 where lengthb(se_keyword) > 0)a)b where lengthb(b.pair) > 0;
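To make the two-level split above easier to read, here is a rough Python illustration of what it is expected to produce per record; the output format of alinlp_termweight_ecom (pairs joined by "%", word and weight joined by "|") is inferred only from the separators used in the SQL and should be treated as an assumption.

# Plain-Python illustration (not the actual UDFs) of the split done by
# bi_udf:bi_split_value + graph_embedding:hs_split_1 above.
# Assumed input format: "word1|w1%word2|w2%..." per the separators in the SQL.
def split_termweight(record_id, tag_result):
    rows = []
    for pair in tag_result.split("%"):        # bi_split_value: one row per pair
        if not pair:
            continue
        word, weight = pair.split("|", 1)     # hs_split_1: pair -> (word, weight)
        rows.append((record_id, word, float(weight)))
    return rows

print(split_termweight(1, "连衣裙|0.62%夏季|0.21%女|0.17"))
# -> [(1, '连衣裙', 0.62), (1, '夏季', 0.21), (1, '女', 0.17)]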
Embed the segmented words
create table hs_dssm_dic_query_3 as select id, word, weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_2;
Take the weighted average of the word embeddings to get one vector per query
create table graph_embedding.hs_tmp_160 as select id, graph_embedding:hs_merge_emb_14(weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_3 group by id;
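The aggregation in hs_merge_emb_14 is assumed to be a weight-normalized average of the per-word embeddings for one id; the sketch below works under that assumption and also assumes word_emb is a delimited string of 100 floats, which is not confirmed by the SQL.

# Sketch of what hs_merge_emb_14 is assumed to compute:
# a weight-normalized average of the word embeddings belonging to one id.
import numpy as np

def merge_weighted_emb(weights, word_embs):
    """weights: list of floats; word_embs: list of delimited float strings (assumed format)."""
    vecs = np.array([[float(x) for x in emb.replace(",", " ").split()] for emb in word_embs])
    w = np.asarray(weights, dtype=float)
    w = w / w.sum() if w.sum() > 0 else np.full(len(w), 1.0 / len(w))
    return (w[:, None] * vecs).sum(axis=0)    # one vector per id

print(merge_weighted_emb([0.7, 0.3], ["0.1,0.2,0.3", "0.4,0.5,0.6"]))
# -> approximately [0.19 0.29 0.39]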
- Get the weighted-average embeddings for inference
train_query : hs_dssm_dic_query_1 - | id | words_mainse_ids | se_keyword |
train_title : hs_dssm_dic_title_3 - | id | words_mainse_ids | title |
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
hs_dssm_dic_query_inf_1 -> hs_tmp_162
create table graph_embedding.hs_dssm_dic_query_inf_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(query, "%", "{weight}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_query_inf_1 where lengthb(query) > 0)a)b where lengthb(b.pair) > 0;
create table hs_dssm_dic_query_inf_3 as select id, word, weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_query_inf_2;
create table graph_embedding.hs_tmp_162 as select id, graph_embedding:hs_merge_emb_14(weight, word_emb) as emb from graph_embedding.hs_dssm_dic_query_inf_3 group by id;
hs_dssm_dic_title_inf_1 -> hs_tmp_164
create table graph_embedding.hs_dssm_dic_title_inf_2 as
select graph_embedding:hs_split_1(id, pair, "|") as (id, word, weight) from
(select bi_udf:bi_split_value(id, tag_result, "%") as (id, pair) from
(select id, search_kg:alinlp_termweight_ecom(title, "%", "{weight}", 1, 1) as tag_result from graph_embedding.hs_dssm_dic_title_inf_1 where lengthb(title) > 0)a)b where lengthb(b.pair) > 0;
create table hs_dssm_dic_title_inf_3 as select id, word, weight, search_kg:alinlp_word_embedding(word, "100", "CONTENT_SEARCH") as word_emb from hs_dssm_dic_title_inf_2;
create table graph_embedding.hs_tmp_164 as select id, graph_embedding:hs_merge_emb_14(weight, word_emb) as emb from graph_embedding.hs_dssm_dic_title_inf_3 group by id;
Get the inference data
create table graph_embedding.hs_tmp_156 as
select c.query_id, c.title_id, c.query, d.emb as title from
(select a.*, b.emb as query from
(select * from graph_embedding.hs_tmp_157)a left join (select * from graph_embedding.hs_tmp_162)b on a.query_id == b.id)c
left join (select * from graph_embedding.hs_tmp_164)d on c.title_id == d.id;
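As a quick, purely illustrative sanity check before running the dssm_v0 inference job, the two embedding columns of hs_tmp_156 could be parsed and scored with cosine similarity; the string delimiter and the scoring step are assumptions here, not part of the pipeline above.

# Hypothetical sanity check on hs_tmp_156: parse the query/title embedding
# strings and score each pair with cosine similarity.
import numpy as np

def parse_emb(s):
    # Assumes the emb column is a delimited string of floats.
    return np.array([float(x) for x in s.replace(",", " ").split()])

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

def score_pair(query_emb_str, title_emb_str):
    return cosine(parse_emb(query_emb_str), parse_emb(title_emb_str))

print(score_pair("0.1,0.2,0.3", "0.1,0.2,0.3"))   # -> ~1.0 for identical vectors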