@@ -8993,15 +8993,15 @@ FORCE_INLINE uint64x2_t vmlal_high_laneq_u32(uint64x2_t a, uint32x4_t b, uint32x
       __riscv_vwmaccu_vv_u64m2(__riscv_vlmul_ext_v_u64m1_u64m2(a), b_high, c_dup, 2));
 }
 
-FORCE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c, const int __d) {
-  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
+FORCE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c, const int lane) {
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
   vint32m1_t bc_mul = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vwmul_vv_i32m2(b, c_dup, 4));
   vint32m1_t bc_mulx2 = __riscv_vmul_vx_i32m1(bc_mul, 2, 4);
   return __riscv_vadd_vv_i32m1(a, bc_mulx2, 4);
 }
 
-FORCE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c, const int __d) {
-  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
+FORCE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c, const int lane) {
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
   vint64m1_t bc_mul = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vwmul_vv_i64m2(b, c_dup, 2));
   vint64m1_t bc_mulx2 = __riscv_vmul_vx_i64m1(bc_mul, 2, 2);
   return __riscv_vadd_vv_i64m1(a, bc_mulx2, 2);
@@ -9011,9 +9011,21 @@ FORCE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c, c
 
 // FORCE_INLINE int64_t vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t v, const int lane);
 
-// FORCE_INLINE int32x4_t vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v, const int lane);
+FORCE_INLINE int32x4_t vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t c, const int lane) {
+  vint16m1_t b_high = __riscv_vslidedown_vx_i16m1(b, 4, 8);
+  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
+  vint32m1_t bc_mul = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vwmul_vv_i32m2(b_high, c_dup, 4));
+  vint32m1_t bc_mulx2 = __riscv_vmul_vx_i32m1(bc_mul, 2, 4);
+  return __riscv_vadd_vv_i32m1(a, bc_mulx2, 4);
+}
 
-// FORCE_INLINE int64x2_t vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v, const int lane);
+FORCE_INLINE int64x2_t vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t c, const int lane) {
+  vint32m1_t b_high = __riscv_vslidedown_vx_i32m1(b, 2, 4);
+  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
+  vint64m1_t bc_mul = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vwmul_vv_i64m2(b_high, c_dup, 2));
+  vint64m1_t bc_mulx2 = __riscv_vmul_vx_i64m1(bc_mul, 2, 2);
+  return __riscv_vadd_vv_i64m1(a, bc_mulx2, 2);
+}
 
 // FORCE_INLINE int32x4_t vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v, const int lane);
 
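For reference, here is a minimal scalar model of what `vqdmlal_high_lane_s16` is specified to compute on Arm (per the architecture's SQDMLAL semantics: the doubling product saturates, then the accumulate saturates). The names `scalar_vqdmlal_high_lane_s16` and `sat32` are illustrative, not part of this patch:

```c
#include <stdint.h>
#include <stdio.h>

// Saturate a 64-bit intermediate to the int32_t range.
static int32_t sat32(int64_t x) {
  if (x > INT32_MAX) return INT32_MAX;
  if (x < INT32_MIN) return INT32_MIN;
  return (int32_t)x;
}

// Scalar model: for each i in 0..3,
//   result[i] = sat(a[i] + sat(2 * b[4 + i] * v[lane]))
// The doubling product can only overflow int32_t when both operands are
// INT16_MIN; the accumulate saturates as well.
static void scalar_vqdmlal_high_lane_s16(int32_t res[4], const int32_t a[4],
                                         const int16_t b[8], const int16_t v[4],
                                         int lane) {
  for (int i = 0; i < 4; i++) {
    int64_t prod = sat32(2 * (int64_t)b[4 + i] * (int64_t)v[lane]);
    res[i] = sat32((int64_t)a[i] + prod);
  }
}

int main(void) {
  int32_t a[4] = {1, 2, 3, INT32_MAX};
  int16_t b[8] = {0, 0, 0, 0, 10, -20, 300, 16384};
  int16_t v[4] = {7, 2, -3, 4};
  int32_t res[4];
  scalar_vqdmlal_high_lane_s16(res, a, b, v, 1);
  for (int i = 0; i < 4; i++) printf("%d\n", res[i]);
  // prints 41, -78, 1203, 2147483647 (the last lane saturates on accumulate)
  return 0;
}
```

Note that `__riscv_vmul_vx_i32m1` and `__riscv_vadd_vv_i32m1` wrap on overflow rather than saturating, so the RVV version above matches this model only on inputs that never trigger saturation.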