Skip to content

Commit 2db3016

Browse files
authored
Merge pull request #462 from howjmay/vmls_lane
feat: Add vmls[q]_lane[q]_[s16|s32|u16|u32|f32]
2 parents bd7fc58 + 8710c04 commit 2db3016

File tree

3 files changed

+338
-58
lines changed

3 files changed

+338
-58
lines changed

neon2rvv.h

Lines changed: 68 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -9061,93 +9061,123 @@ FORCE_INLINE int64x2_t vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_
90619061
return __riscv_vadd_vv_i64m1(a, bc_mulx2, 2);
90629062
}
90639063

9064-
FORCE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int __d) {
9065-
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
9064+
// Multiply-subtract by lane, 4 x int16: result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int lane) {
  // Splat c[lane] across 4 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_i16m1(a, b, __riscv_vrgather_vx_i16m1(c, lane, 4), 4);
}
90689068

9069-
FORCE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int __d) {
9070-
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
9069+
// Multiply-subtract by lane, 2 x int32: result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int lane) {
  // Splat c[lane] across 2 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_i32m1(a, b, __riscv_vrgather_vx_i32m1(c, lane, 2), 2);
}
90739073

9074-
FORCE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c, const int __d) {
9075-
vfloat32m1_t c_dup = __riscv_vrgather_vx_f32m1(c, __d, 2);
9074+
// Multiply-subtract by lane, 2 x float32: result[i] = a[i] - b[i] * c[lane].
// Implemented with vfnmsac, i.e. a fused multiply-subtract.
FORCE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c, const int lane) {
  // Splat c[lane] across 2 elements and feed it straight into the multiply-subtract.
  return __riscv_vfnmsac_vv_f32m1(a, b, __riscv_vrgather_vx_f32m1(c, lane, 2), 2);
}
90789078

9079-
FORCE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c, const int __d) {
9080-
vuint16m1_t c_dup = __riscv_vrgather_vx_u16m1(c, __d, 4);
9079+
// Multiply-subtract by lane, 4 x uint16: result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c, const int lane) {
  // Splat c[lane] across 4 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_u16m1(a, b, __riscv_vrgather_vx_u16m1(c, lane, 4), 4);
}
90839083

9084-
FORCE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c, const int __d) {
9085-
vuint32m1_t c_dup = __riscv_vrgather_vx_u32m1(c, __d, 2);
9084+
// Multiply-subtract by lane, 2 x uint32: result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c, const int lane) {
  // Splat c[lane] across 2 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_u32m1(a, b, __riscv_vrgather_vx_u32m1(c, lane, 2), 2);
}
90889088

9089-
FORCE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
9090-
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 8);
9089+
// Multiply-subtract by lane, 8 x int16 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
  // The splat is 8 elements wide so every lane of the 128-bit operands gets c[lane].
  const vint16m1_t splat = __riscv_vrgather_vx_i16m1(c, lane, 8);
  return __riscv_vnmsac_vv_i16m1(a, b, splat, 8);
}
90939093

9094-
FORCE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
9095-
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 4);
9094+
// Multiply-subtract by lane, 4 x int32 with a 2-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
  // The splat is 4 elements wide so every lane of the 128-bit operands gets c[lane].
  const vint32m1_t splat = __riscv_vrgather_vx_i32m1(c, lane, 4);
  return __riscv_vnmsac_vv_i32m1(a, b, splat, 4);
}
90989098

9099-
FORCE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c, const int __d) {
9100-
vfloat32m1_t c_dup = __riscv_vrgather_vx_f32m1(c, __d, 4);
9099+
// Multiply-subtract by lane, 4 x float32 with a 2-lane selector vector:
// result[i] = a[i] - b[i] * c[lane]. Implemented with vfnmsac (fused multiply-subtract).
FORCE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c, const int lane) {
  // The splat is 4 elements wide so every lane of the 128-bit operands gets c[lane].
  const vfloat32m1_t splat = __riscv_vrgather_vx_f32m1(c, lane, 4);
  return __riscv_vfnmsac_vv_f32m1(a, b, splat, 4);
}
91039103

9104-
// FORCE_INLINE int16x4_t vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
9104+
// Multiply-subtract by lane, 4 x int16 with an 8-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int16x4_t vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
  // The splat is produced for 8 elements; the multiply-subtract consumes the first 4.
  const vint16m1_t splat = __riscv_vrgather_vx_i16m1(c, lane, 8);
  return __riscv_vnmsac_vv_i16m1(a, b, splat, 4);
}
91059108

9106-
// FORCE_INLINE int16x8_t vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
9109+
// Multiply-subtract by lane, 8 x int16 with an 8-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int16x8_t vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
  // Splat c[lane] across all 8 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_i16m1(a, b, __riscv_vrgather_vx_i16m1(c, lane, 8), 8);
}
91079113

9108-
// FORCE_INLINE int32x2_t vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane);
9114+
// Multiply-subtract by lane, 2 x int32 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int32x2_t vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t c, const int lane) {
  // The splat is produced for 4 elements; the multiply-subtract consumes the first 2.
  const vint32m1_t splat = __riscv_vrgather_vx_i32m1(c, lane, 4);
  return __riscv_vnmsac_vv_i32m1(a, b, splat, 2);
}
91099118

9110-
// FORCE_INLINE int32x4_t vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane);
9119+
// Multiply-subtract by lane, 4 x int32 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE int32x4_t vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t c, const int lane) {
  // Splat c[lane] across all 4 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_i32m1(a, b, __riscv_vrgather_vx_i32m1(c, lane, 4), 4);
}
91119123

9112-
// FORCE_INLINE uint16x4_t vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v, const int lane);
9124+
// Multiply-subtract by lane, 4 x uint16 with an 8-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint16x4_t vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t c, const int lane) {
  // The splat is produced for 8 elements; the multiply-subtract consumes the first 4.
  const vuint16m1_t splat = __riscv_vrgather_vx_u16m1(c, lane, 8);
  return __riscv_vnmsac_vv_u16m1(a, b, splat, 4);
}
91139128

9114-
// FORCE_INLINE uint16x8_t vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v, const int lane);
9129+
// Multiply-subtract by lane, 8 x uint16 with an 8-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint16x8_t vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c, const int lane) {
  // Splat c[lane] across all 8 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_u16m1(a, b, __riscv_vrgather_vx_u16m1(c, lane, 8), 8);
}
91159133

9116-
// FORCE_INLINE uint32x2_t vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v, const int lane);
9134+
// Multiply-subtract by lane, 2 x uint32 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint32x2_t vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t c, const int lane) {
  // The splat is produced for 4 elements; the multiply-subtract consumes the first 2.
  const vuint32m1_t splat = __riscv_vrgather_vx_u32m1(c, lane, 4);
  return __riscv_vnmsac_vv_u32m1(a, b, splat, 2);
}
91179138

9118-
// FORCE_INLINE uint32x4_t vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v, const int lane);
9139+
// Multiply-subtract by lane, 4 x uint32 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint32x4_t vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c, const int lane) {
  // Splat c[lane] across all 4 elements and feed it straight into the multiply-subtract.
  return __riscv_vnmsac_vv_u32m1(a, b, __riscv_vrgather_vx_u32m1(c, lane, 4), 4);
}
91199143

9120-
// FORCE_INLINE float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v, const int lane);
9144+
// Multiply-subtract by lane, 2 x float32 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane]. Implemented with vfnmsac (fused multiply-subtract).
FORCE_INLINE float32x2_t vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t c, const int lane) {
  // The splat is produced for 4 elements; the multiply-subtract consumes the first 2.
  const vfloat32m1_t splat = __riscv_vrgather_vx_f32m1(c, lane, 4);
  return __riscv_vfnmsac_vv_f32m1(a, b, splat, 2);
}
91219148

9122-
// FORCE_INLINE float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v, const int lane);
9149+
// Multiply-subtract by lane, 4 x float32 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane]. Implemented with vfnmsac (fused multiply-subtract).
FORCE_INLINE float32x4_t vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, const int lane) {
  // Splat c[lane] across all 4 elements and feed it straight into the multiply-subtract.
  return __riscv_vfnmsac_vv_f32m1(a, b, __riscv_vrgather_vx_f32m1(c, lane, 4), 4);
}
91239153

9124-
FORCE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c, const int __d) {
9125-
vuint16m1_t c_dup = __riscv_vrgather_vx_u16m1(c, __d, 8);
9154+
// Multiply-subtract by lane, 8 x uint16 with a 4-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c, const int lane) {
  // The splat is 8 elements wide so every lane of the 128-bit operands gets c[lane].
  const vuint16m1_t splat = __riscv_vrgather_vx_u16m1(c, lane, 8);
  return __riscv_vnmsac_vv_u16m1(a, b, splat, 8);
}
91289158

9129-
FORCE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c, const int __d) {
9130-
vuint32m1_t c_dup = __riscv_vrgather_vx_u32m1(c, __d, 4);
9159+
// Multiply-subtract by lane, 4 x uint32 with a 2-lane selector vector:
// result[i] = a[i] - b[i] * c[lane].
FORCE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c, const int lane) {
  // The splat is 4 elements wide so every lane of the 128-bit operands gets c[lane].
  const vuint32m1_t splat = __riscv_vrgather_vx_u32m1(c, lane, 4);
  return __riscv_vnmsac_vv_u32m1(a, b, splat, 4);
}
91339163

9134-
FORCE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c, const int __d) {
9135-
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
9164+
// Widening multiply-subtract by lane, int16 -> int32:
// result[i] = a[i] - (b[i] * c[lane]) with the product formed at 32 bits, i in 0..3.
FORCE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c, const int lane) {
  const vint16m1_t splat = __riscv_vrgather_vx_i16m1(c, lane, 4);
  // The widening multiply returns an LMUL=2 group; keep only its low register.
  const vint32m2_t wide = __riscv_vwmul_vv_i32m2(b, splat, 4);
  const vint32m1_t prod = __riscv_vlmul_trunc_v_i32m2_i32m1(wide);
  return __riscv_vsub_vv_i32m1(a, prod, 4);
}
91389168

9139-
FORCE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c, const int __d) {
9140-
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
9169+
// Widening multiply-subtract by lane, int32 -> int64:
// result[i] = a[i] - (b[i] * c[lane]) with the product formed at 64 bits, i in 0..1.
FORCE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c, const int lane) {
  const vint32m1_t splat = __riscv_vrgather_vx_i32m1(c, lane, 2);
  // The widening multiply returns an LMUL=2 group; keep only its low register.
  const vint64m2_t wide = __riscv_vwmul_vv_i64m2(b, splat, 2);
  const vint64m1_t prod = __riscv_vlmul_trunc_v_i64m2_i64m1(wide);
  return __riscv_vsub_vv_i64m1(a, prod, 2);
}
91439173

9144-
FORCE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c, const int __d) {
9145-
vuint16m1_t c_dup = __riscv_vrgather_vx_u16m1(c, __d, 8);
9174+
// Widening multiply-subtract by lane, uint16 -> uint32:
// result[i] = a[i] - (b[i] * c[lane]) with the product formed at 32 bits, i in 0..3.
//
// a    : 32-bit accumulators.
// b    : 16-bit multiplicands.
// c    : vector supplying the scalar multiplier.
// lane : index of the element of c to use (0..3 expected for a 4-lane vector; not checked here).
FORCE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c, const int lane) {
  // Only 4 lanes are consumed below, so splat exactly 4 (was 8), matching vmlsl_lane_s16.
  vuint16m1_t c_dup = __riscv_vrgather_vx_u16m1(c, lane, 4);
  // The widening multiply returns an LMUL=2 group; the 4 products are taken from its low
  // register (assumes VLEN >= 128 - TODO confirm against the project's configuration).
  vuint32m1_t prod = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vwmulu_vv_u32m2(b, c_dup, 4));
  return __riscv_vsub_vv_u32m1(a, prod, 4);
}
91489178

9149-
FORCE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c, const int __d) {
9150-
vuint32m1_t c_dup = __riscv_vrgather_vx_u32m1(c, __d, 4);
9179+
// Widening multiply-subtract by lane, uint32 -> uint64:
// result[i] = a[i] - (b[i] * c[lane]) with the product formed at 64 bits, i in 0..1.
//
// a    : 64-bit accumulators.
// b    : 32-bit multiplicands.
// c    : vector supplying the scalar multiplier.
// lane : index of the element of c to use (0..1 expected for a 2-lane vector; not checked here).
FORCE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c, const int lane) {
  // Only 2 lanes are consumed below, so splat exactly 2 (was 4), matching vmlsl_lane_s32.
  vuint32m1_t c_dup = __riscv_vrgather_vx_u32m1(c, lane, 2);
  // The widening multiply returns an LMUL=2 group; the 2 products are taken from its low
  // register (assumes VLEN >= 128 - TODO confirm against the project's configuration).
  vuint64m1_t prod = __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vwmulu_vv_u64m2(b, c_dup, 2));
  return __riscv_vsub_vv_u64m1(a, prod, 2);
}
91539183

0 commit comments

Comments
 (0)