20190801工作进展

昨天工作：

对dssm网络进行inference测试，效果仍然不好，网络存在过拟合问题。尝试用early stopping缓解，没有明显改善；又尝试降低网络复杂度，将attention替换成average pooling，目前网络还在训练中。
发现inference结果中，置信度高的样本里有大量query为空。尝试去掉训练集中query为空的数据后重新训练并测试，没有明显改善。

今天计划：
寻找改善dssm网络效果的方式。

使用alinlp的预训练dssm做测试：

-- Score every (query, title) row of hs_tmp_131 with the alinlp DSSM similarity UDF.
-- Both texts are first passed through alinlp_segment(text, 'MAINSE', ' ');
-- presumably 'MAINSE' selects the tokenizer and ' ' is the token delimiter — TODO confirm.
-- Session setting for UDF JVM memory (presumably in MB — confirm against MaxCompute docs).
set odps.sql.udf.jvm.memory=4096;

-- "if exists" keeps this re-runnable when hs_tmp_148 has not been created yet,
-- matching the other drop statements in this log. The bare "yes" line that follows
-- is presumably the answer to the client's drop confirmation prompt.
drop table if exists hs_tmp_148;
yes
create table hs_tmp_148 as
select
    query,
    title,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(query, 'MAINSE', ' '),
        search_kg:alinlp_segment(title, 'MAINSE', ' '),
        ' '
    ) as score
from hs_tmp_131;

-- Final (query, title, score) table for the pre-trained DSSM test.
-- hs_tmp_148 is created with query, title and the UDF result aliased "as score" on the
-- same row, so it is read directly here.
-- The earlier form joined hs_tmp_131 to hs_tmp_148 on row_number() over() (no ordering, so
-- the row pairing between the two scans was not guaranteed) and selected a column _c0,
-- which does not exist in hs_tmp_148 because its score column is explicitly named.
create table hs_tmp_149 as
select
    query,
    title,
    score
from hs_tmp_148;

先得到embedding，再算距离

-- Rebuild the per-pair table that stores one UDF output for the segmented query
-- (query_emb) and one for the segmented title (title_emb), read from graph_embedding.hs_tmp_131.
-- The table is created with LIFECYCLE 7.
-- NOTE(review): the output columns are named *_emb, but the UDF called is
-- alinlp_dssm_text_similarity with a single text argument — confirm this call form
-- really returns an embedding vector.
drop table if exists graph_embedding.hs_origindssm_query_title_emb_;
yes
create table if not exists graph_embedding.hs_origindssm_query_title_emb_
LIFECYCLE 7
as
select
    query,
    title,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(query, 'MAINSE', ' '),
        ' '
    ) as query_emb,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(title, 'MAINSE', ' '),
        ' '
    ) as title_emb
from graph_embedding.hs_tmp_131;

alinlp得到的结果

得到embedding向量

-- hs_tmp_150: one row per entry of hs_dssm_dic_query_inf_1, keyed by its id (as node_id),
-- with the UDF output for the segmented query text stored in emb.
create table hs_tmp_150 as
select
    id as node_id,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(query, 'MAINSE', ' '),
        ' '
    ) as emb
from hs_dssm_dic_query_inf_1;

-- hs_tmp_151: same layout, built from the title text of hs_dssm_dic_title_inf_1.
create table hs_tmp_151 as
select
    id as node_id,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(title, 'MAINSE', ' '),
        ' '
    ) as emb
from hs_dssm_dic_title_inf_1;

knn计算

-- Recreate the empty table that the nearest-neighbour job writes to
-- (it is passed as -Doutputs in the PAI command of this log).
-- Two columns: a bigint node_id and a string emb; LIFECYCLE is 14.
drop table if exists graph_embedding.hs_tmp_143;
yes
create table if not exists graph_embedding.hs_tmp_143
(
    node_id bigint,
    emb     string
)
LIFECYCLE 14;

-- Run the algo_market nearest-neighbour search component.
-- For each vector in the -Dquery table (hs_tmp_150, built from hs_dssm_dic_query_inf_1) it
-- searches the -Dinput table (hs_tmp_151, built from hs_dssm_dic_title_inf_1) and writes to
-- hs_tmp_143. Settings passed: 64-dimensional vectors, topk=10, metric "l2".
-- NOTE(review): the JSON value of -Dcluster contains unescaped double quotes inside a
-- double-quoted argument — confirm the client parses it as intended.
PAI -name am_vsearch_nearest_neighbor_014 -project algo_market
-Dcluster="{"worker":{"count":40,"gpu":100}}"
-Ddim=64
-Did_col="node_id"
-Dvector_col="emb"
-Dinput_slice=40
-Dtopk=10
-Dnprob=512
-Dmetric="l2"
-Dinput="odps://graph_embedding/tables/hs_tmp_151"
-Dquery="odps://graph_embedding/tables/hs_tmp_150"
-Doutputs="odps://graph_embedding/tables/hs_tmp_143"
-DenableDynamicCluster=true -DmaxTrainingTimeInHour=60;

关联

-- Expand each hs_tmp_143 row with the bi_split_value UDTF, splitting emb on a single space
-- and emitting (query_id, title_id) rows.
-- Presumably emb holds the space-separated neighbour list written by the knn job and each
-- title_id still carries an "id:score" pair at this stage — TODO confirm.
-- "if exists" keeps the drop from failing when hs_tmp_144 has not been created yet.
drop table if exists hs_tmp_144;
yes
create table hs_tmp_144 as
select
    bi_udf:bi_split_value(node_id, emb, " ") as (query_id, title_id)
from hs_tmp_143;

-- Split the title_id field of hs_tmp_144 on ":" with the hs_split UDTF, producing
-- (query_id, title_id, score) rows.
-- "if exists" keeps the drop from failing when hs_tmp_145 has not been created yet.
drop table if exists hs_tmp_145;
yes
create table hs_tmp_145 as
select
    graph_embedding:hs_split(query_id, title_id, ":") as (query_id, title_id, score)
from hs_tmp_144;

-- Resolve the ids in hs_tmp_145 back to text: query_id -> query via hs_dssm_dic_query_inf_1,
-- title_id -> title via hs_dssm_dic_title_inf_1. Both lookups are left joins, so every
-- hs_tmp_145 row is kept even when an id has no dictionary match (the text is then NULL).
-- "if exists" keeps the drop from failing when hs_tmp_146 has not been created yet.
drop table if exists hs_tmp_146;
yes
create table hs_tmp_146 as
select
    q.query,
    t.title,
    p.score
from hs_tmp_145 p
left join hs_dssm_dic_query_inf_1 q
    on p.query_id == q.id
left join hs_dssm_dic_title_inf_1 t
    on p.title_id == t.id;

组会

组会围绕dssm展开：过拟合问题比较严重，尝试了多种方式进行改善，但效果不佳。

全零

最后编辑于：2019.08.01 13:34:32