2021-04-21

Task3

水了一波，有点跟不上：看不懂的函数要去查，很费时间，磕磕绊绊只进行到了统计特征部分。以下是一些我还不熟悉的代码。

1.这么多划分方法真的都有效果吗？缺一不可吗？（待验证）

def get_d_change_rate_fea(df):
    """Return, per ship, the maximum rate of change of heading.

    Two headings are considered:

    * the heading derived from consecutive ``x``/``y`` positions
      (``hc_xy_s_max``), and
    * the reported heading column ``d`` (``hc_d_s_max``).

    Both are expressed in degrees per second between consecutive records of
    the same ship.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ship``, ``time``, ``x``, ``y`` and ``d``.
        ``time`` must be a datetime column, because the elapsed time is
        converted with ``.dt.total_seconds()``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ship with the columns ``ship``, ``hc_xy_s_max`` and
        ``hc_d_s_max``. A value is NaN when no rate can be computed for that
        ship (for example a ship with a single record).
    """
    import math

    temp = df.copy()
    # Order every track chronologically so that "next row" means "next record
    # of the same ship".
    temp.sort_values(['ship', 'time'], ascending=True, inplace=True)

    # shift(-1) inside each ship pulls the following record's value onto the
    # current row; the last record of a ship has no successor and gets NaN.
    temp['timenext'] = temp.groupby('ship')['time'].shift(-1)
    temp['ynext'] = temp.groupby('ship')['y'].shift(-1)
    temp['xnext'] = temp.groupby('ship')['x'].shift(-1)
    # Forward-fill the gaps per ship only, so a value from one ship can never
    # leak into another ship's track.
    temp['ynext'] = temp.groupby('ship')['ynext'].ffill()
    temp['xnext'] = temp.groupby('ship')['xnext'].ffill()

    # xnext/ynext are the next *positions*, so the current position has to be
    # subtracted to obtain the displacement whose slope gives the heading.
    slope = (temp['ynext'] - temp['y']) / (temp['xnext'] - temp['x'])
    temp['angle_next'] = np.arctan(slope) / math.pi * 180
    temp['angle_next_next'] = temp.groupby('ship')['angle_next'].shift(-1)

    temp['timediff'] = (temp['timenext'] - temp['time']).abs()
    temp['timediff'] = temp.groupby('ship')['timediff'].ffill()
    elapsed_s = temp['timediff'].dt.total_seconds()
    # A zero-length interval has no meaningful rate; treat it as missing
    # instead of dividing by zero.
    elapsed_s = elapsed_s.where(elapsed_s != 0)

    temp['hc_xy'] = (temp['angle_next_next'] - temp['angle_next']).abs()
    # Differences above 180 degrees are folded back to the smaller angle.
    wrap_xy = temp['hc_xy'] > 180
    temp.loc[wrap_xy, 'hc_xy'] = 360 - temp.loc[wrap_xy, 'hc_xy']
    temp['hc_xy_s'] = temp['hc_xy'] / elapsed_s

    temp['d_next'] = temp.groupby('ship')['d'].shift(-1)
    temp['hc_d'] = (temp['d_next'] - temp['d']).abs()
    wrap_d = temp['hc_d'] > 180
    temp.loc[wrap_d, 'hc_d'] = 360 - temp.loc[wrap_d, 'hc_d']
    temp['hc_d_s'] = temp['hc_d'] / elapsed_s

    # One aggregation for both rates; the columns are renamed explicitly
    # because passing a renaming dict to SeriesGroupBy.agg is rejected by
    # current pandas versions.
    tmp = (
        temp.groupby('ship')[['hc_xy_s', 'hc_d_s']]
        .max()
        .rename(columns={'hc_xy_s': 'hc_xy_s_max', 'hc_d_s': 'hc_d_s_max'})
        .reset_index()
    )
    return tmp

# Example call: build the per-ship change-rate feature table. `temp` is the
# input DataFrame and must already be defined by the surrounding code.
c5 = get_d_change_rate_fea(temp)

2.分箱函数
pd.cut()
3.记住用法
# np.linspace with endpoint=True and num=col_bins + 1 returns col_bins + 1
# evenly spaced values from x_min to x_max inclusive, i.e. the edges of
# col_bins equal-width intervals along x.
x_bins = np.linspace(x_min, x_max, endpoint=True, num=col_bins + 1)
# Same construction along y: row_bins + 1 edges delimiting row_bins intervals.
y_bins = np.linspace(y_min, y_max, endpoint=True, num=row_bins + 1)
4.没看懂的
# Remember which columns exist before feature generation so the newly added
# ones can be listed at the end.
pre_cols = df.columns

g = df.groupby('id')
for f in ['x', 'y']:
    # Per-id coordinate differences: current vs. previous row (shift 1),
    # current vs. next row (shift -1), and previous vs. next row.
    df[f + '_prev_diff'] = df[f] - g[f].shift(1)
    df[f + '_next_diff'] = df[f] - g[f].shift(-1)
    df[f + '_prev_next_diff'] = g[f].shift(1) - g[f].shift(-1)

# Euclidean length of each of the three displacement vectors built above.
df['dist_move_prev'] = np.sqrt(np.square(df['x_prev_diff']) + np.square(df['y_prev_diff']))
df['dist_move_next'] = np.sqrt(np.square(df['x_next_diff']) + np.square(df['y_next_diff']))
df['dist_move_prev_next'] = np.sqrt(np.square(df['x_prev_next_diff']) + np.square(df['y_prev_next_diff']))

# Equal-frequency binning of the distance to the previous point into at most
# 50 bins (duplicate bin edges are dropped).
df['dist_move_prev_bin'] = pd.qcut(df['dist_move_prev'], 50, duplicates='drop')
# Encode every bin as its ordinal position. The keys come from the
# categorical's own categories: unique() also yields NaN (the first row of an
# id has no previous point) while nunique() does not count it, so zipping
# those two would silently leave one real bin without a code.
bin_categories = df['dist_move_prev_bin'].cat.categories
df['dist_move_prev_bin'] = df['dist_move_prev_bin'].map(
    dict(zip(bin_categories, range(len(bin_categories))))
)

# Columns created by this snippet; the bare expression previews them when run
# in a notebook cell.
new_cols = [i for i in df.columns if i not in pre_cols]
df[new_cols].head()
5.看了一些关于embedding的解读，直观有点感受，但是理解不深（难难难！！！）

2021-04-21

Task3

推荐阅读更多精彩内容