/*
 * Multiply-add: returns a * b + c.
 *
 * NOTE(review): assumes __m128 is a compiler vector type (e.g. a typedef of
 * the NEON float32x4_t) so that `*` and `+` apply per lane -- confirm against
 * the typedef used by this project.
 * NOTE(review): the product and the sum are written as separate operations;
 * whether they are contracted into a single fused instruction depends on the
 * compiler and its floating-point contraction settings.
 */
__m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
{
    return a * b + c;
}

/*
 * Broadcast: returns a vector in which every lane holds lane 0 of `a`.
 *
 * vget_low_f32() extracts the low half of `a`, and vdupq_lane_f32(..., 0)
 * replicates that half's lane 0 across the full-width result.
 *
 * NOTE(review): assumes __m128 is interchangeable with float32x4_t -- confirm.
 */
__m128 _mm_broadcastss_ps(__m128 a)
{
    return vdupq_lane_f32(vget_low_f32(a), 0);
}