Skip to content

Commit c8786ba

Browse files
authored
Merge pull request #455 from howjmay/vmlal_high_lane
feat: Add vmlal_high_lane_[s16|s32|u16|u32]
2 parents 62e9ed1 + 571b166 commit c8786ba

File tree

3 files changed

+132
-12
lines changed

3 files changed

+132
-12
lines changed

neon2rvv.h

+24-4
Original file line numberDiff line numberDiff line change
@@ -8917,13 +8917,33 @@ FORCE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c,
89178917
return __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vwmaccu_vv_u64m2(__riscv_vlmul_ext_v_u64m1_u64m2(a), b, c_dup, 2));
89188918
}
89198919

8920-
// FORCE_INLINE int32x4_t vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v, const int lane);
8920+
FORCE_INLINE int32x4_t vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t c, const int lane) {
8921+
vint16m1_t b_high = __riscv_vslidedown_vx_i16m1(b, 4, 8);
8922+
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 4);
8923+
return __riscv_vlmul_trunc_v_i32m2_i32m1(
8924+
__riscv_vwmacc_vv_i32m2(__riscv_vlmul_ext_v_i32m1_i32m2(a), b_high, c_dup, 4));
8925+
}
89218926

8922-
// FORCE_INLINE int64x2_t vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v, const int lane);
8927+
FORCE_INLINE int64x2_t vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t c, const int lane) {
8928+
vint32m1_t b_high = __riscv_vslidedown_vx_i32m1(b, 2, 4);
8929+
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 2);
8930+
return __riscv_vlmul_trunc_v_i64m2_i64m1(
8931+
__riscv_vwmacc_vv_i64m2(__riscv_vlmul_ext_v_i64m1_i64m2(a), b_high, c_dup, 2));
8932+
}
89238933

8924-
// FORCE_INLINE uint32x4_t vmlal_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v, const int lane);
8934+
FORCE_INLINE uint32x4_t vmlal_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t c, const int lane) {
8935+
vuint16m1_t b_high = __riscv_vslidedown_vx_u16m1(b, 4, 8);
8936+
vuint16m1_t c_dup = __riscv_vrgather_vx_u16m1(c, lane, 4);
8937+
return __riscv_vlmul_trunc_v_u32m2_u32m1(
8938+
__riscv_vwmaccu_vv_u32m2(__riscv_vlmul_ext_v_u32m1_u32m2(a), b_high, c_dup, 4));
8939+
}
89258940

8926-
// FORCE_INLINE uint64x2_t vmlal_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v, const int lane);
8941+
FORCE_INLINE uint64x2_t vmlal_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t c, const int lane) {
8942+
vuint32m1_t b_high = __riscv_vslidedown_vx_u32m1(b, 2, 4);
8943+
vuint32m1_t c_dup = __riscv_vrgather_vx_u32m1(c, lane, 2);
8944+
return __riscv_vlmul_trunc_v_u64m2_u64m1(
8945+
__riscv_vwmaccu_vv_u64m2(__riscv_vlmul_ext_v_u64m1_u64m2(a), b_high, c_dup, 2));
8946+
}
89278947

89288948
// FORCE_INLINE int32x4_t vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v, const int lane);
89298949

tests/impl.cpp

+104-4
Original file line numberDiff line numberDiff line change
@@ -31507,13 +31507,113 @@ result_t test_vmlal_lane_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
3150731507
#endif // ENABLE_TEST_ALL
3150831508
}
3150931509

31510-
result_t test_vmlal_high_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
31510+
result_t test_vmlal_high_lane_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
31511+
#ifdef ENABLE_TEST_ALL
31512+
const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
31513+
const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
31514+
const int16_t *_c = (int16_t *)impl.test_cases_int_pointer3;
31515+
int32_t _d[8];
31516+
int32x4_t a = vld1q_s32(_a);
31517+
int16x8_t b = vld1q_s16(_b);
31518+
int16x4_t c = vld1_s16(_c);
31519+
int32x4_t d;
31520+
31521+
#define TEST_IMPL(IDX) \
31522+
for (int i = 0; i < 4; i++) { \
31523+
_d[i] = _a[i] + (int32_t)_b[i + 4] * (int32_t)_c[IDX]; \
31524+
} \
31525+
d = vmlal_high_lane_s16(a, b, c, IDX); \
31526+
CHECK_RESULT(validate_int32(d, _d[0], _d[1], _d[2], _d[3]))
31527+
31528+
IMM_4_ITER
31529+
#undef TEST_IMPL
31530+
31531+
return TEST_SUCCESS;
31532+
#else
31533+
return TEST_UNIMPL;
31534+
#endif // ENABLE_TEST_ALL
31535+
}
3151131536

31512-
result_t test_vmlal_high_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
31537+
result_t test_vmlal_high_lane_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
31538+
#ifdef ENABLE_TEST_ALL
31539+
const int64_t *_a = (int64_t *)impl.test_cases_int_pointer1;
31540+
const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
31541+
const int32_t *_c = (int32_t *)impl.test_cases_int_pointer3;
31542+
int64_t _d[4];
31543+
int64x2_t a = vld1q_s64(_a);
31544+
int32x4_t b = vld1q_s32(_b);
31545+
int32x2_t c = vld1_s32(_c);
31546+
int64x2_t d;
31547+
31548+
#define TEST_IMPL(IDX) \
31549+
for (int i = 0; i < 2; i++) { \
31550+
_d[i] = _a[i] + (int64_t)_b[i + 2] * (int64_t)_c[IDX]; \
31551+
} \
31552+
d = vmlal_high_lane_s32(a, b, c, IDX); \
31553+
CHECK_RESULT(validate_int64(d, _d[0], _d[1]))
3151331554

31514-
result_t test_vmlal_high_lane_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
31555+
IMM_2_ITER
31556+
#undef TEST_IMPL
31557+
31558+
return TEST_SUCCESS;
31559+
#else
31560+
return TEST_UNIMPL;
31561+
#endif // ENABLE_TEST_ALL
31562+
}
3151531563

31516-
result_t test_vmlal_high_lane_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
31564+
result_t test_vmlal_high_lane_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
31565+
#ifdef ENABLE_TEST_ALL
31566+
const uint32_t *_a = (uint32_t *)impl.test_cases_int_pointer1;
31567+
const uint16_t *_b = (uint16_t *)impl.test_cases_int_pointer2;
31568+
const uint16_t *_c = (uint16_t *)impl.test_cases_int_pointer3;
31569+
uint32_t _d[8];
31570+
uint32x4_t a = vld1q_u32(_a);
31571+
uint16x8_t b = vld1q_u16(_b);
31572+
uint16x4_t c = vld1_u16(_c);
31573+
uint32x4_t d;
31574+
31575+
#define TEST_IMPL(IDX) \
31576+
for (int i = 0; i < 4; i++) { \
31577+
_d[i] = _a[i] + (uint32_t)_b[i + 4] * (uint32_t)_c[IDX]; \
31578+
} \
31579+
d = vmlal_high_lane_u16(a, b, c, IDX); \
31580+
CHECK_RESULT(validate_uint32(d, _d[0], _d[1], _d[2], _d[3]))
31581+
31582+
IMM_4_ITER
31583+
#undef TEST_IMPL
31584+
31585+
return TEST_SUCCESS;
31586+
#else
31587+
return TEST_UNIMPL;
31588+
#endif // ENABLE_TEST_ALL
31589+
}
31590+
31591+
result_t test_vmlal_high_lane_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
31592+
#ifdef ENABLE_TEST_ALL
31593+
const uint64_t *_a = (uint64_t *)impl.test_cases_int_pointer1;
31594+
const uint32_t *_b = (uint32_t *)impl.test_cases_int_pointer2;
31595+
const uint32_t *_c = (uint32_t *)impl.test_cases_int_pointer3;
31596+
uint64_t _d[4];
31597+
uint64x2_t a = vld1q_u64(_a);
31598+
uint32x4_t b = vld1q_u32(_b);
31599+
uint32x2_t c = vld1_u32(_c);
31600+
uint64x2_t d;
31601+
31602+
#define TEST_IMPL(IDX) \
31603+
for (int i = 0; i < 2; i++) { \
31604+
_d[i] = _a[i] + (uint64_t)_b[i + 2] * (uint64_t)_c[IDX]; \
31605+
} \
31606+
d = vmlal_high_lane_u32(a, b, c, IDX); \
31607+
CHECK_RESULT(validate_uint64(d, _d[0], _d[1]))
31608+
31609+
IMM_2_ITER
31610+
#undef TEST_IMPL
31611+
31612+
return TEST_SUCCESS;
31613+
#else
31614+
return TEST_UNIMPL;
31615+
#endif // ENABLE_TEST_ALL
31616+
}
3151731617

3151831618
result_t test_vmlal_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
3151931619

tests/impl.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -1934,10 +1934,10 @@
19341934
_(vmlal_lane_s32) \
19351935
_(vmlal_lane_u16) \
19361936
_(vmlal_lane_u32) \
1937-
/*_(vmlal_high_lane_s16) */ \
1938-
/*_(vmlal_high_lane_s32) */ \
1939-
/*_(vmlal_high_lane_u16) */ \
1940-
/*_(vmlal_high_lane_u32) */ \
1937+
_(vmlal_high_lane_s16) \
1938+
_(vmlal_high_lane_s32) \
1939+
_(vmlal_high_lane_u16) \
1940+
_(vmlal_high_lane_u32) \
19411941
/*_(vmlal_laneq_s16) */ \
19421942
/*_(vmlal_laneq_s32) */ \
19431943
/*_(vmlal_laneq_u16) */ \

0 commit comments

Comments
 (0)