import sys
import pickle
import numpy as np
import pandas as pd
sys.path.insert(0, '/Users/puhaoran/Desktop/code/algpro/rcpro/eva')
def get_debug_sample(sample_nums=10000):
    """Sample a subset of users from the training click log for debugging.

    Reads ``./inputs/train_click_log.csv``, draws up to ``sample_nums``
    distinct user ids at random (without replacement) and keeps every click
    row that belongs to one of the drawn users.

    Args:
        sample_nums: Number of distinct users to draw. When the log holds
            fewer distinct users than this, all of them are kept.

    Returns:
        DataFrame with the sampled users' clicks, with duplicate
        ``(user_id, click_article_id, click_timestamp)`` rows removed.
    """
    train_df = pd.read_csv('./inputs/train_click_log.csv')
    train_user_ids = train_df.user_id.unique()
    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more items than exist, so cap the request at the population size.
    sample_size = min(sample_nums, len(train_user_ids))
    sample_user_ids = np.random.choice(train_user_ids, size=sample_size, replace=False)
    all_click_df = train_df[train_df.user_id.isin(sample_user_ids)]
    return all_click_df.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
def get_train_df(strategy='local'):
    """Load the click log for the offline or the online strategy.

    Args:
        strategy: ``'local'`` (offline) loads only
            ``./inputs/train_click_log.csv``. Any other value (online) also
            loads ``./inputs/testA_click_log.csv`` and stacks it below the
            training rows.

    Returns:
        DataFrame of clicks with duplicate
        ``(user_id, click_article_id, click_timestamp)`` rows removed.
    """
    train_df = pd.read_csv('./inputs/train_click_log.csv')
    if strategy == 'local':
        all_click_df = train_df
    else:
        test_df = pd.read_csv('./inputs/testA_click_log.csv')
        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
        # rows the same way and keeps each frame's original index labels.
        all_click_df = pd.concat([train_df, test_df])
    return all_click_df.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
def get_item_info():
    """Load the article table.

    Returns:
        The contents of ``./inputs/articles.csv`` with the ``article_id``
        column renamed to ``click_article_id`` (presumably so it lines up
        with the click-log column of the same name).
    """
    column_renames = {'article_id': 'click_article_id'}
    return pd.read_csv('./inputs/articles.csv').rename(columns=column_renames)
def get_emb_dict():
    """Build, save and return the normalised article-embedding lookup.

    Reads ``./inputs/articles_emb.csv``, treats every column whose name
    contains ``'emb'`` as one embedding dimension, scales each row to unit
    L2 norm, and pickles the resulting mapping to ``./inputs/item_emb.pkl``.

    Returns:
        dict mapping each ``article_id`` to its normalised embedding row
        (a 1-D numpy array).
    """
    item_emb_df = pd.read_csv('./inputs/articles_emb.csv')
    emb_cols = [col for col in item_emb_df.columns if 'emb' in col]
    emb_matrix = np.ascontiguousarray(item_emb_df[emb_cols])
    # L2-normalise each row.
    norms = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
    # An all-zero row has norm 0; dividing by it would produce NaN, so
    # divide by 1 instead and leave such rows as zeros.
    norms[norms == 0] = 1.0
    emb_matrix = emb_matrix / norms
    item_emb_dict = dict(zip(item_emb_df['article_id'], emb_matrix))
    # Context manager guarantees the pickle file is flushed and closed.
    with open('./inputs/item_emb.pkl', 'wb') as fout:
        pickle.dump(item_emb_dict, fout)
    return item_emb_dict
data = get_debug_sample()


def min_max_scaler(x):
    """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

    NOTE(review): when every value in ``x`` is equal the denominator is 0,
    so the result is not a usable number; callers appear to rely on the
    timestamps having some spread.
    """
    lowest = np.min(x)
    return (x - lowest) / (np.max(x) - lowest)


# Normalise the timestamps; they are used to compute weights for the
# association rules.
data['click_timestamp'] = data[['click_timestamp']].apply(min_max_scaler)
item_df = get_item_info()
# NOTE(review): this rebinds the name `get_emb_dict` from the loader function
# to the dict it returns, so the function cannot be called again afterwards.
# The binding is kept because other code may read the dict under this name;
# consider renaming it (e.g. `item_emb_dict`) once all users are known.
get_emb_dict = get_emb_dict()

# Container for the multi-channel recall results, one entry per recall method.
multiple_recall_dict = {
    'cold_boot': {},
    'itemcf_recall': {},
    'emb_recall': {},
    'youtubednn_recall': {},
}