Task3
这次有点划水,进度跟不上:遇到看不懂的函数就去查,很费时间,磕磕绊绊只进行到了统计特征部分。以下是一些我不熟悉的代码:
1.这么多划分方法真的都有效果吗?是否缺一不可?(待验证)
def _wrapped_abs_diff(nxt, cur):
    """Return |nxt - cur| in degrees, folded so the result never exceeds 180."""
    diff = (nxt - cur).abs()
    # A difference above 180 degrees is the same turn taken the other way
    # round; NaN compares False and stays NaN after `360 - NaN`.
    return diff.where(~(diff > 180), 360 - diff)


def get_d_change_rate_fea(df):
    """Compute the per-ship maximum heading-change rates (degrees / second).

    For every pair of consecutive track points of a ship two rates are
    derived and then reduced to their maximum per ship:

    * ``hc_xy_s_max`` - change of the bearing obtained from the x/y
      displacement between consecutive points, divided by the elapsed time.
    * ``hc_d_s_max`` - change of the ``d`` column (handled as degrees: a
      difference above 180 is folded to ``360 - diff``), divided by the
      elapsed time.

    Args:
        df: DataFrame with the columns ``ship``, ``time``, ``x``, ``y`` and
            ``d``. ``time`` must be datetime-like so that differences are
            timedeltas. The input frame is not modified.

    Returns:
        DataFrame with one row per ship and the columns ``ship``,
        ``hc_xy_s_max`` and ``hc_d_s_max``. Ships without enough points to
        form a rate get NaN.
    """
    temp = df.copy()
    # Order each ship's track chronologically before looking at neighbours.
    temp.sort_values(['ship', 'time'], ascending=True, inplace=True)

    # shift(-1) inside each ship pulls the *next* row's value onto the current
    # row; the last row of every ship has no successor and becomes NaN/NaT.
    temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
    temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
    temp['xnext'] = temp.groupby('ship')['x'].shift(-1)

    # Fill the trailing gap from the previous row *of the same ship*, so that
    # no value leaks over from a different ship.
    temp['ynext'] = temp.groupby('ship')['ynext'].ffill()
    temp['xnext'] = temp.groupby('ship')['xnext'].ffill()

    # xnext / ynext are the next *positions*, so the displacement is
    # (next - current). A zero x-displacement gives +/-inf, which arctan maps
    # to +/-90 degrees; 0/0 gives NaN.
    slope = (temp['ynext'] - temp['y']) / (temp['xnext'] - temp['x'])
    temp['angle_next'] = np.arctan(slope) / np.pi * 180
    temp['angle_next_next'] = temp.groupby('ship')['angle_next'].shift(-1)

    temp['timediff'] = (temp['timenext'] - temp['time']).abs()
    temp['timediff'] = temp.groupby('ship')['timediff'].ffill()
    # NOTE: duplicate timestamps within a ship give 0 seconds, so the rates
    # below become inf (or NaN for 0/0) on those rows.
    elapsed_s = temp['timediff'].dt.total_seconds()

    # Turn rate derived from the x/y bearing.
    temp['hc_xy'] = _wrapped_abs_diff(temp['angle_next_next'], temp['angle_next'])
    temp['hc_xy_s'] = temp['hc_xy'] / elapsed_s

    # Turn rate derived from the `d` column.
    temp['d_next'] = temp.groupby('ship')['d'].shift(-1)
    temp['hc_d'] = _wrapped_abs_diff(temp['d_next'], temp['d'])
    temp['hc_d_s'] = temp['hc_d'] / elapsed_s

    # Passing a renaming dict to SeriesGroupBy.agg raises SpecificationError
    # on pandas >= 1.0, so aggregate first and rename the result instead.
    xy_d_rate = (
        temp.groupby('ship')['hc_xy_s'].max().rename('hc_xy_s_max').reset_index()
    )
    d_d_rate = (
        temp.groupby('ship')['hc_d_s'].max().rename('hc_d_s_max').reset_index()
    )
    return xy_d_rate.merge(d_d_rate, on='ship', how='left')
# Build the per-ship maximum heading-change-rate feature table.
# NOTE(review): `temp` is not defined in this excerpt - presumably the
# trajectory DataFrame prepared earlier; confirm against the full notebook.
c5 = get_d_change_rate_fea(temp)
2.分箱函数
pd.cut()
3.记住用法
# Bin edges: `col_bins + 1` evenly spaced values covering [x_min, x_max] and
# `row_bins + 1` values covering [y_min, y_max], both end points included.
x_bins = np.linspace(start=x_min, stop=x_max, num=col_bins + 1, endpoint=True)
y_bins = np.linspace(start=y_min, stop=y_max, num=row_bins + 1, endpoint=True)
4.没看懂的
# Remember the columns present before feature engineering so the newly added
# ones can be listed at the end.
pre_cols = df.columns
g = df.groupby('id')
for f in ['x', 'y']:
    # Per-id coordinate differences against the neighbouring rows:
    # current - previous, current - next, and previous - next.
    df[f + '_prev_diff'] = df[f] - g[f].shift(1)
    df[f + '_next_diff'] = df[f] - g[f].shift(-1)
    df[f + '_prev_next_diff'] = g[f].shift(1) - g[f].shift(-1)
# Euclidean distance (Pythagoras on the x/y differences) to the previous row,
# to the next row, and between the previous and the next row.
df['dist_move_prev'] = np.sqrt(np.square(df['x_prev_diff']) + np.square(df['y_prev_diff']))
df['dist_move_next'] = np.sqrt(np.square(df['x_next_diff']) + np.square(df['y_next_diff']))
df['dist_move_prev_next'] = np.sqrt(np.square(df['x_prev_next_diff']) + np.square(df['y_prev_next_diff']))
# Equal-frequency binning of the distance to the previous row into at most 50
# bins (duplicate edges are dropped). labels=False returns the ordinal bin
# index directly and keeps NaN for rows without a previous point. The former
# dict(zip(unique(), range(nunique()))) mapping was wrong because unique()
# includes NaN while nunique() does not, so the last-seen bin got no code and
# the codes followed order of appearance instead of bin order.
df['dist_move_prev_bin'] = pd.qcut(df['dist_move_prev'], 50, labels=False, duplicates='drop')
new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
5.看了一些关于embedding的解读,直观有点感受,但是理解不深(难难难!!!)