
Commit b00a626

Merge pull request #474 from howjmay/mul
feat: Add vmul[q]_lane[q]_[s16|s32|u16|u32]
2 parents 27da1cb + 907c8e3 commit b00a626


3 files changed: +232 -24 lines changed

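For reference, each of the new by-lane multiplies takes a whole vector in its first argument and one element, selected by the constant lane index, from a 128-bit vector in its second argument. A minimal scalar sketch of the vmul_laneq_s16 semantics, for illustration only (the reference helper below is not part of the library):

#include <stdint.h>

// Reference behaviour of vmul_laneq_s16: every element of the 4 x int16 vector `a`
// is multiplied by element `lane` (0..7) of the 8 x int16 vector `b`.
static void vmul_laneq_s16_ref(const int16_t a[4], const int16_t b[8], int lane, int16_t out[4]) {
  for (int i = 0; i < 4; i++) {
    out[i] = (int16_t)(a[i] * b[lane]);
  }
}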

neon2rvv.h

+32 -8
@@ -8882,21 +8882,45 @@ FORCE_INLINE float64_t vmuld_lane_f64(float64_t a, float64x1_t b, const int lane
   return a * b_lane;
 }

-// FORCE_INLINE int16x4_t vmul_laneq_s16(int16x4_t a, int16x8_t b, const int lane);
+FORCE_INLINE int16x4_t vmul_laneq_s16(int16x4_t a, int16x8_t b, const int lane) {
+  vint16m1_t b_dup_lane = __riscv_vrgather_vx_i16m1(b, lane, 8);
+  return __riscv_vmul_vv_i16m1(a, b_dup_lane, 4);
+}

-// FORCE_INLINE int16x8_t vmulq_laneq_s16(int16x8_t a, int16x8_t b, const int lane);
+FORCE_INLINE int16x8_t vmulq_laneq_s16(int16x8_t a, int16x8_t b, const int lane) {
+  vint16m1_t b_dup_lane = __riscv_vrgather_vx_i16m1(b, lane, 8);
+  return __riscv_vmul_vv_i16m1(a, b_dup_lane, 8);
+}

-// FORCE_INLINE int32x2_t vmul_laneq_s32(int32x2_t a, int32x4_t b, const int lane);
+FORCE_INLINE int32x2_t vmul_laneq_s32(int32x2_t a, int32x4_t b, const int lane) {
+  vint32m1_t b_dup_lane = __riscv_vrgather_vx_i32m1(b, lane, 4);
+  return __riscv_vmul_vv_i32m1(a, b_dup_lane, 2);
+}

-// FORCE_INLINE int32x4_t vmulq_laneq_s32(int32x4_t a, int32x4_t b, const int lane);
+FORCE_INLINE int32x4_t vmulq_laneq_s32(int32x4_t a, int32x4_t b, const int lane) {
+  vint32m1_t b_dup_lane = __riscv_vrgather_vx_i32m1(b, lane, 4);
+  return __riscv_vmul_vv_i32m1(a, b_dup_lane, 4);
+}

-// FORCE_INLINE uint16x4_t vmul_laneq_u16(uint16x4_t a, uint16x8_t b, const int lane);
+FORCE_INLINE uint16x4_t vmul_laneq_u16(uint16x4_t a, uint16x8_t b, const int lane) {
+  vuint16m1_t b_dup_lane = __riscv_vrgather_vx_u16m1(b, lane, 8);
+  return __riscv_vmul_vv_u16m1(a, b_dup_lane, 4);
+}

-// FORCE_INLINE uint16x8_t vmulq_laneq_u16(uint16x8_t a, uint16x8_t b, const int lane);
+FORCE_INLINE uint16x8_t vmulq_laneq_u16(uint16x8_t a, uint16x8_t b, const int lane) {
+  vuint16m1_t b_dup_lane = __riscv_vrgather_vx_u16m1(b, lane, 8);
+  return __riscv_vmul_vv_u16m1(a, b_dup_lane, 8);
+}

-// FORCE_INLINE uint32x2_t vmul_laneq_u32(uint32x2_t a, uint32x4_t b, const int lane);
+FORCE_INLINE uint32x2_t vmul_laneq_u32(uint32x2_t a, uint32x4_t b, const int lane) {
+  vuint32m1_t b_dup_lane = __riscv_vrgather_vx_u32m1(b, lane, 4);
+  return __riscv_vmul_vv_u32m1(a, b_dup_lane, 2);
+}

-// FORCE_INLINE uint32x4_t vmulq_laneq_u32(uint32x4_t a, uint32x4_t b, const int lane);
+FORCE_INLINE uint32x4_t vmulq_laneq_u32(uint32x4_t a, uint32x4_t b, const int lane) {
+  vuint32m1_t b_dup_lane = __riscv_vrgather_vx_u32m1(b, lane, 4);
+  return __riscv_vmul_vv_u32m1(a, b_dup_lane, 4);
+}

 FORCE_INLINE float32x2_t vmul_laneq_f32(float32x2_t a, float32x4_t b, const int lane) {
   vfloat32m1_t b_dup_lane = __riscv_vrgather_vx_f32m1(b, lane, 4);
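The lowering pattern is the same in every variant: __riscv_vrgather_vx_* broadcasts element `lane` of b across a register, and __riscv_vmul_vv_* then multiplies element-wise with vl set to the destination's element count (4 for vmul_laneq_s16, 8 for vmulq_laneq_s16, and so on). A small usage sketch, assuming a RISC-V toolchain with the V extension and neon2rvv.h on the include path; scale_by_lane3 is an illustrative helper, not part of the library:

#include "neon2rvv.h"

// Multiply each of the four elements of v by element 3 of coeffs.
// As with the Arm intrinsic, the lane index should be a compile-time constant.
static inline int16x4_t scale_by_lane3(int16x4_t v, int16x8_t coeffs) {
  return vmul_laneq_s16(v, coeffs, 3);
}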

tests/impl.cpp

+192 -8
@@ -31585,21 +31585,205 @@ result_t test_vmuld_lane_f64(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #endif // ENABLE_TEST_ALL
 }

-result_t test_vmul_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmul_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  int16_t _c[4];
+  int16x4_t a = vld1_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x4_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 4; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmul_laneq_s16(a, b, IDX);  \
+  CHECK_RESULT(validate_int16(c, _c[0], _c[1], _c[2], _c[3]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
+
+result_t test_vmulq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (int16_t *)impl.test_cases_int_pointer2;
+  int16_t _c[8];
+  int16x8_t a = vld1q_s16(_a);
+  int16x8_t b = vld1q_s16(_b);
+  int16x8_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 8; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulq_laneq_s16(a, b, IDX); \
+  CHECK_RESULT(validate_int16(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
+
+result_t test_vmul_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
+  int32_t _c[2];
+  int32x2_t a = vld1_s32(_a);
+  int32x4_t b = vld1q_s32(_b);
+  int32x2_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 2; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmul_laneq_s32(a, b, IDX);  \
+  CHECK_RESULT(validate_int32(c, _c[0], _c[1]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
+
+result_t test_vmulq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = (int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (int32_t *)impl.test_cases_int_pointer2;
+  int32_t _c[4];
+  int32x4_t a = vld1q_s32(_a);
+  int32x4_t b = vld1q_s32(_b);
+  int32x4_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 4; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulq_laneq_s32(a, b, IDX); \
+  CHECK_RESULT(validate_int32(c, _c[0], _c[1], _c[2], _c[3]))
+
+  IMM_4_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
+
+result_t test_vmul_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const uint16_t *_a = (uint16_t *)impl.test_cases_int_pointer1;
+  const uint16_t *_b = (uint16_t *)impl.test_cases_int_pointer2;
+  uint16_t _c[4];
+  uint16x4_t a = vld1_u16(_a);
+  uint16x8_t b = vld1q_u16(_b);
+  uint16x4_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 4; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmul_laneq_u16(a, b, IDX);  \
+  CHECK_RESULT(validate_uint16(c, _c[0], _c[1], _c[2], _c[3]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
+
+result_t test_vmulq_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const uint16_t *_a = (uint16_t *)impl.test_cases_int_pointer1;
+  const uint16_t *_b = (uint16_t *)impl.test_cases_int_pointer2;
+  uint16_t _c[8];
+  uint16x8_t a = vld1q_u16(_a);
+  uint16x8_t b = vld1q_u16(_b);
+  uint16x8_t c;
+
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 8; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulq_laneq_u16(a, b, IDX); \
+  CHECK_RESULT(validate_uint16(c, _c[0], _c[1], _c[2], _c[3], _c[4], _c[5], _c[6], _c[7]))
+
+  IMM_8_ITER
+#undef TEST_IMPL
+
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}
+
+result_t test_vmul_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const uint32_t *_a = (uint32_t *)impl.test_cases_int_pointer1;
+  const uint32_t *_b = (uint32_t *)impl.test_cases_int_pointer2;
+  uint32_t _c[2];
+  uint32x2_t a = vld1_u32(_a);
+  uint32x4_t b = vld1q_u32(_b);
+  uint32x2_t c;

-result_t test_vmulq_laneq_s16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 2; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmul_laneq_u32(a, b, IDX);  \
+  CHECK_RESULT(validate_uint32(c, _c[0], _c[1]))

-result_t test_vmul_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  IMM_4_ITER
+#undef TEST_IMPL

-result_t test_vmulq_laneq_s32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}

-result_t test_vmul_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+result_t test_vmulq_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
+#ifdef ENABLE_TEST_ALL
+  const uint32_t *_a = (uint32_t *)impl.test_cases_int_pointer1;
+  const uint32_t *_b = (uint32_t *)impl.test_cases_int_pointer2;
+  uint32_t _c[4];
+  uint32x4_t a = vld1q_u32(_a);
+  uint32x4_t b = vld1q_u32(_b);
+  uint32x4_t c;

-result_t test_vmulq_laneq_u16(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+#define TEST_IMPL(IDX)            \
+  for (int i = 0; i < 4; i++) {   \
+    _c[i] = _a[i] * _b[IDX];      \
+  }                               \
+  c = vmulq_laneq_u32(a, b, IDX); \
+  CHECK_RESULT(validate_uint32(c, _c[0], _c[1], _c[2], _c[3]))

-result_t test_vmul_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  IMM_4_ITER
+#undef TEST_IMPL

-result_t test_vmulq_laneq_u32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) { return TEST_UNIMPL; }
+  return TEST_SUCCESS;
+#else
+  return TEST_UNIMPL;
+#endif // ENABLE_TEST_ALL
+}

 result_t test_vmul_laneq_f32(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {
 #ifdef ENABLE_TEST_ALL
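Each test follows the same shape: a scalar loop computes the expected products for one lane index, the intrinsic is called with that index, and the result is validated. Because the lane argument must be a compile-time constant, the body is wrapped in TEST_IMPL(IDX) and instantiated once per legal index via IMM_4_ITER or IMM_8_ITER. A standalone sketch of that expansion pattern, with assumed definitions of the iteration helpers and a hypothetical run_one_lane_test stand-in for the real body:

#include <stdio.h>

// Hypothetical stand-in for one lane's test body; the real TEST_IMPL computes expected
// products in scalar code and checks them against the intrinsic's output.
static void run_one_lane_test(int lane) { printf("testing lane %d\n", lane); }

// Assumed expansions: one TEST_IMPL instantiation per legal lane index.
#define TEST_IMPL(IDX) run_one_lane_test(IDX);
#define IMM_4_ITER TEST_IMPL(0) TEST_IMPL(1) TEST_IMPL(2) TEST_IMPL(3)
#define IMM_8_ITER IMM_4_ITER TEST_IMPL(4) TEST_IMPL(5) TEST_IMPL(6) TEST_IMPL(7)

int main(void) {
  IMM_8_ITER  // expands to eight calls, lanes 0 through 7
  return 0;
}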

tests/impl.h

+8 -8
@@ -1894,14 +1894,14 @@
   _(vmulq_lane_f64) \
   _(vmuls_lane_f32) \
   _(vmuld_lane_f64) \
-  /*_(vmul_laneq_s16) */ \
-  /*_(vmulq_laneq_s16) */ \
-  /*_(vmul_laneq_s32) */ \
-  /*_(vmulq_laneq_s32) */ \
-  /*_(vmul_laneq_u16) */ \
-  /*_(vmulq_laneq_u16) */ \
-  /*_(vmul_laneq_u32) */ \
-  /*_(vmulq_laneq_u32) */ \
+  _(vmul_laneq_s16) \
+  _(vmulq_laneq_s16) \
+  _(vmul_laneq_s32) \
+  _(vmulq_laneq_s32) \
+  _(vmul_laneq_u16) \
+  _(vmulq_laneq_u16) \
+  _(vmul_laneq_u32) \
+  _(vmulq_laneq_u32) \
   _(vmul_laneq_f32) \
   _(vmulq_laneq_f32) \
   _(vmul_laneq_f64) \
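The list in tests/impl.h is an X-macro registry: each _(name) entry is expanded elsewhere into declarations and dispatch entries, so uncommenting the eight vmul[q]_laneq entries is what wires the new tests into the runner. A generic, self-contained sketch of the X-macro technique; the names INTRIN_LIST, DECLARE_TEST, and CALL_TEST below are illustrative, not the project's actual macros:

#include <stdio.h>

// The registry: one _(name) entry per intrinsic under test.
#define INTRIN_LIST(_)  \
  _(vmul_laneq_s16)     \
  _(vmulq_laneq_u32)

// First expansion: define one test function per entry.
#define DECLARE_TEST(name) static void test_##name(void) { printf("running %s\n", #name); }
INTRIN_LIST(DECLARE_TEST)
#undef DECLARE_TEST

// Second expansion: invoke every registered test.
#define CALL_TEST(name) test_##name();
int main(void) {
  INTRIN_LIST(CALL_TEST)
  return 0;
}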
