Group by user_id and take the row with the maximum click_times (only one record here, since the frame is pre-filtered to a single user)
df[df['user_id'] == 1].sort_values('click_times', ascending=False).groupby('user_id', as_index=False).first()
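For the general case (one max-click_times row per user, without filtering to a single user first), a minimal sketch on a hypothetical toy frame (df_demo and its values are illustrative, not from the original):
import pandas as pd
df_demo = pd.DataFrame({'user_id': [1, 1, 2, 2],
                        'click_times': [3, 7, 5, 1],
                        'ad_id': [10, 11, 12, 13]})
# sort descending so the largest click_times comes first, then keep the first row per user
top = df_demo.sort_values('click_times', ascending=False).groupby('user_id', as_index=False).first()
# equivalent alternative: select the rows at each group's click_times maximum
top_alt = df_demo.loc[df_demo.groupby('user_id')['click_times'].idxmax()]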
Create an empty DataFrame with the same structure (columns and index)
df2 = pd.DataFrame(data=None, columns=df1.columns, index=df1.index)
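Note that the frame above is all-NaN and does not keep df1's dtypes; if matching dtypes matters, a minimal alternative sketch is to slice zero rows:
df2 = df1.iloc[0:0].copy()  # same columns and dtypes as df1, but no rows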
df.columns.values.tolist()  # list of column names
x_train = x_train['ad_id'].to_frame().astype('int')
df_train = df_click.merge(df_user, on='user_id', how='inner').groupby(['user_id', 'ad_id'], as_index=False).agg({'click_times': 'sum', 'gender': 'max'})
Training table
x = df1.groupby('user_id').apply(lambda g: (g['creative_id'] * g['click_times']).sum()).to_frame()  # per group: multiply the two columns then sum; to_frame() converts the resulting Series to a DataFrame
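The groupby-apply above can be slow on large frames; a vectorized sketch of the same per-user weighted sum (assuming the same df1 columns; the helper column name 'prod' is made up, and the result column is named 'prod' rather than 0):
x = (df1.assign(prod=df1['creative_id'] * df1['click_times'])
        .groupby('user_id')['prod'].sum()
        .to_frame())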
Write prediction results to CSV
result = pd.DataFrame({'user_id': df_user_click['user_id'], 'predicted_age': 3, 'predicted_gender': prediction})  # prediction is an ndarray; predicted_age is broadcast as the constant 3
result.to_csv('submission.csv', index=False)
pyhive
from pyhive import hive

conn = hive.Connection(host='10.201.48.141', port=10000, username='hadoop', database='dm')
cursor = conn.cursor()
sql="select site_tp,goods_sn,eval_id, eval_cont, eval_cont_2 \
From dm.dm_svc_goods_evaluate_di a where site_tp='shein' and dt='"+dt+"' " \
" and size(split(eval_cont_2,' '))>=2 " + limit_size
cursor.execute(sql)
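To pull the query result into pandas, one common pattern is a sketch like the following (assuming pandas is imported as pd; df_eval is just an illustrative name, and Hive may return qualified column names such as a.site_tp):
rows = cursor.fetchall()
cols = [c[0] for c in cursor.description]   # column names from the DB-API cursor metadata
df_eval = pd.DataFrame(rows, columns=cols)
# or, often equivalently: df_eval = pd.read_sql(sql, conn)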
K-fold target encoding
import pandas as pd
from sklearn.model_selection import KFold
from category_encoders import TargetEncoder

# build toy data
data = pd.DataFrame({'feature': ['a', 'a', 'a', 'b', 'a', 'a', 'a', 'b', 'b', 'b'],
                     'label': [0, 1, 0, 0, 1, 0, 0, 0, 0, 1]})
# get the label column as a Series
labels = data.label
# drop the label column from the feature frame
data.drop('label', axis=1, inplace=True)
# DataFrame to collect the out-of-fold target-encoded rows
df = pd.DataFrame(columns=['feature'])
# 5-fold target encoding
kf = KFold(n_splits=5, shuffle=False)
for train_idx, test_idx in kf.split(data):
    print(train_idx, test_idx)  # ndarrays, e.g. [2 3 4 5 6 7 8 9] [0 1]
    # fit the encoder on the training folds only, to avoid target leakage
    te = TargetEncoder(cols=['feature']).fit(data.iloc[train_idx], labels.iloc[train_idx])
    # encode the held-out fold and collect it
    to = te.transform(data.iloc[test_idx])
    df = pd.concat([df, to])  # concatenate DataFrames row-wise (DataFrame.append was removed in pandas 2.0)
print(df)
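Each concatenated piece keeps its original row index, so a short follow-up sketch to restore the original order and attach the out-of-fold encoding as a new column (feature_te is a hypothetical name):
df_oof = df.sort_index()
data['feature_te'] = df_oof['feature']   # assignment aligns on the shared index
print(data)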