昨天工作:
- 对dssm网络进行inference测试,效果仍然不好,网络存在过拟合问题。尝试用early stopping的方式解决,没有明显改善。目前尝试降低网络复杂度,将attention替换成average pooling,网络仍在训练中。
- 发现inference中置信度高的结果中存在大量的query为空的结果,尝试去掉训练集中query为空的训练数据并重新训练测试,没有大的改善。
今天计划:
寻找改善dssm网络效果的方式。
- 使用alinlp的预训练dssm做测试:
-- Score every (query, title) row of hs_tmp_131 with the pretrained alinlp DSSM
-- similarity UDF. Both texts are first passed through
-- alinlp_segment(<text>, 'MAINSE', ' ').
-- Sets the UDF JVM memory option to 4096 for this session.
set odps.sql.udf.jvm.memory=4096;
-- "if exists" keeps the script re-runnable when the table is not there yet.
-- The bare "yes" line is presumably the answer to the console's drop
-- confirmation prompt.
drop table if exists hs_tmp_148;
yes
create table hs_tmp_148 as
select
    query,
    title,
    search_kg:alinlp_dssm_text_similarity(
        search_kg:alinlp_segment(query, 'MAINSE', ' '),
        search_kg:alinlp_segment(title, 'MAINSE', ' '),
        ' '
    ) as score
from hs_tmp_131;
-- hs_tmp_148 already holds query, title and score on the same row, so the
-- result is taken from it directly. The previous version joined two
-- row_number() over() sequences (no ORDER BY, so the pairing of rows was not
-- deterministic) and read a column named _c0 that hs_tmp_148 does not have.
create table hs_tmp_149 as
select
    query,
    title,
    score
from hs_tmp_148;
- 先得到embedding,再算距离
-- Rebuild graph_embedding.hs_origindssm_query_title_emb_ (lifecycle 7) with one
-- row per (query, title) of graph_embedding.hs_tmp_131 plus two derived
-- columns, query_emb and title_emb.
-- NOTE(review): alinlp_dssm_text_similarity is called here with a single
-- segmented text and a separator; presumably this form returns the text's
-- embedding rather than a similarity score -- confirm against the UDF docs.
-- The bare "yes" line is presumably the answer to the console's drop
-- confirmation prompt.
drop table if exists graph_embedding.hs_origindssm_query_title_emb_;
yes
create table if not exists graph_embedding.hs_origindssm_query_title_emb_
lifecycle 7
as
select
    query,
    title,
    search_kg:alinlp_dssm_text_similarity(search_kg:alinlp_segment(query, 'MAINSE', ' '), ' ') as query_emb,
    search_kg:alinlp_dssm_text_similarity(search_kg:alinlp_segment(title, 'MAINSE', ' '), ' ') as title_emb
from graph_embedding.hs_tmp_131;
- alinlp得到的结果
inference_query : hs_dssm_dic_query_inf_1 - | id | words_mainse_ids | query |
inference_title : hs_dssm_dic_title_inf_1 - | id | words_mainse_ids | title |
得到embedding向量
-- hs_tmp_150: (node_id, emb) for every row of the query dictionary
-- hs_dssm_dic_query_inf_1, with node_id taken from its id column.
-- NOTE(review): the UDF is called with one segmented text and a separator;
-- presumably this form yields the embedding -- confirm.
create table hs_tmp_150 as
select
    id as node_id,
    search_kg:alinlp_dssm_text_similarity(search_kg:alinlp_segment(query, 'MAINSE', ' '), ' ') as emb
from hs_dssm_dic_query_inf_1;
-- hs_tmp_151: the same projection over the title dictionary
-- hs_dssm_dic_title_inf_1.
create table hs_tmp_151 as
select
    id as node_id,
    search_kg:alinlp_dssm_text_similarity(search_kg:alinlp_segment(title, 'MAINSE', ' '), ' ') as emb
from hs_dssm_dic_title_inf_1;
knn计算
-- Recreate the empty table graph_embedding.hs_tmp_143 (lifecycle 14) that the
-- nearest-neighbour job writes to via its -Doutputs parameter.
-- The bare "yes" line is presumably the answer to the console's drop
-- confirmation prompt.
drop table if exists graph_embedding.hs_tmp_143;
yes
create table if not exists graph_embedding.hs_tmp_143
(
    node_id bigint,
    emb     string
)
lifecycle 14;
-- Run the am_vsearch_nearest_neighbor_014 PAI job: vectors from hs_tmp_150
-- (-Dquery) are searched against vectors from hs_tmp_151 (-Dinput) with
-- dim=64, topk=10 and metric "l2"; results go to hs_tmp_143 (-Doutputs).
-- The inner double quotes of the -Dcluster JSON are backslash-escaped so they
-- do not terminate the surrounding double-quoted value.
PAI -name am_vsearch_nearest_neighbor_014 -project algo_market
-Dcluster="{\"worker\":{\"count\":40,\"gpu\":100}}"
-Ddim=64
-Did_col="node_id"
-Dvector_col="emb"
-Dinput_slice=40
-Dtopk=10
-Dnprob=512
-Dmetric="l2"
-Dinput="odps://graph_embedding/tables/hs_tmp_151"
-Dquery="odps://graph_embedding/tables/hs_tmp_150"
-Doutputs="odps://graph_embedding/tables/hs_tmp_143"
-DenableDynamicCluster=true -DmaxTrainingTimeInHour=60;
关联
-- Expand each hs_tmp_143 row with the bi_split_value UDTF, called with the
-- node_id, the emb string and a single-space separator; the output columns
-- are named (query_id, title_id).
-- "if exists" keeps the script re-runnable when the table is not there yet.
-- The bare "yes" line is presumably the answer to the console's drop
-- confirmation prompt.
drop table if exists hs_tmp_144;
yes
create table hs_tmp_144 as
select bi_udf:bi_split_value(node_id, emb, " ") as (query_id, title_id)
from hs_tmp_143;
-- Apply the hs_split UDTF to each hs_tmp_144 row, passing ":" as the
-- separator; the output columns are named (query_id, title_id, score).
-- "if exists" keeps the script re-runnable when the table is not there yet.
-- The bare "yes" line is presumably the answer to the console's drop
-- confirmation prompt.
drop table if exists hs_tmp_145;
yes
create table hs_tmp_145 as
select graph_embedding:hs_split(query_id, title_id, ":") as (query_id, title_id, score)
from hs_tmp_144;
-- Resolve the ids in hs_tmp_145 back to text: query_id is looked up in
-- hs_dssm_dic_query_inf_1 and title_id in hs_dssm_dic_title_inf_1. Left joins
-- keep every hs_tmp_145 row even when an id has no dictionary match.
-- The "select *" wrapper subqueries were flattened into direct joins (same
-- result) and the comparison uses the standard "=" operator.
-- NOTE(review): query_id/title_id come out of a split UDTF and may be strings
-- while the dictionary id columns may be bigint -- confirm the implicit cast.
-- "if exists" keeps the script re-runnable when the table is not there yet.
-- The bare "yes" line is presumably the answer to the console's drop
-- confirmation prompt.
drop table if exists hs_tmp_146;
yes
create table hs_tmp_146 as
select
    q.query,
    t.title,
    p.score
from hs_tmp_145 p
left join hs_dssm_dic_query_inf_1 q
    on p.query_id = q.id
left join hs_dssm_dic_title_inf_1 t
    on p.title_id = t.id;
- 组会
围绕dssm进行:过拟合问题比较严重,尝试了多种方式进行改善,但效果不佳。