Skip to content

Commit

Permalink
V256 tests passed; Critical fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Epixu committed Jun 14, 2024
1 parent 2324338 commit 06de125
Show file tree
Hide file tree
Showing 28 changed files with 709 additions and 565 deletions.
104 changes: 48 additions & 56 deletions source/Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,10 +342,6 @@ namespace Langulus::SIMD
return V128<std::int64_t> {simde_mm_unpacklo_epi32(m, Zero())};
else if constexpr (CT::UnsignedInteger32<T>)
return V128<std::uint64_t> {simde_mm_unpacklo_epi32(m, Zero())};
else if constexpr (CT::SignedInteger64<T>)
return V128<std::int64_t> {simde_mm_unpacklo_epi64(m, Zero())};
else if constexpr (CT::UnsignedInteger64<T>)
return V128<std::uint64_t> {simde_mm_unpacklo_epi64(m, Zero())};
else
LANGULUS_ERROR("Can't unpack this type");
}
Expand All @@ -364,10 +360,6 @@ namespace Langulus::SIMD
return V128<std::int64_t> {simde_mm_unpackhi_epi32(m, Zero())};
else if constexpr (CT::UnsignedInteger32<T>)
return V128<std::uint64_t> {simde_mm_unpackhi_epi32(m, Zero())};
else if constexpr (CT::SignedInteger64<T>)
return V128<std::int64_t> {simde_mm_unpackhi_epi64(m, Zero())};
else if constexpr (CT::UnsignedInteger64<T>)
return V128<std::uint64_t> {simde_mm_unpackhi_epi64(m, Zero())};
else
LANGULUS_ERROR("Can't unpack this type");
}
Expand Down Expand Up @@ -400,7 +392,7 @@ namespace Langulus::SIMD
// (or vpermq if you want)
auto ordered = simde_mm_permute_pd(
simde_mm_castps_pd(combined),
SIMDE_MM_SHUFFLE(1, 0, 1, 0)
SIMDE_MM_SHUFFLE(0, 0, 0, 1)
);
return V128<std::int32_t> {simde_mm_castpd_si128(ordered)};
#endif
Expand All @@ -421,7 +413,7 @@ namespace Langulus::SIMD
// (or vpermq if you want)
auto ordered = simde_mm_permute_pd(
simde_mm_castps_pd(combined),
SIMDE_MM_SHUFFLE(1, 0, 1, 0)
SIMDE_MM_SHUFFLE(0, 0, 0, 1)
);
return V128<std::uint32_t> {simde_mm_castpd_si128(ordered)};
#endif
Expand Down Expand Up @@ -529,43 +521,35 @@ namespace Langulus::SIMD
NOD() LANGULUS(INLINED)
auto UnpackLo() const noexcept {
if constexpr (CT::SignedInteger8<T>)
return V256<std::int16_t> {simde_mm256_unpacklo_epi8 (m, Zero())};
return V256<std::int16_t> {simde_mm256_cvtepi8_epi16 (simde_mm256_extractf128_si256(m, 0))};
else if constexpr (CT::UnsignedInteger8<T>)
return V256<std::uint16_t> {simde_mm256_unpacklo_epi8 (m, Zero())};
return V256<std::uint16_t> {simde_mm256_cvtepu8_epi16 (simde_mm256_extractf128_si256(m, 0))};
else if constexpr (CT::SignedInteger16<T>)
return V256<std::int32_t> {simde_mm256_unpacklo_epi16(m, Zero())};
return V256<std::int32_t> {simde_mm256_cvtepi16_epi32(simde_mm256_extractf128_si256(m, 0))};
else if constexpr (CT::UnsignedInteger16<T>)
return V256<std::uint32_t> {simde_mm256_unpacklo_epi16(m, Zero())};
return V256<std::uint32_t> {simde_mm256_cvtepu16_epi32(simde_mm256_extractf128_si256(m, 0))};
else if constexpr (CT::SignedInteger32<T>)
return V256<std::int64_t> {simde_mm256_unpacklo_epi32(m, Zero())};
return V256<std::int64_t> {simde_mm256_cvtepi32_epi64(simde_mm256_extractf128_si256(m, 0))};
else if constexpr (CT::UnsignedInteger32<T>)
return V256<std::uint64_t> {simde_mm256_unpacklo_epi32(m, Zero())};
else if constexpr (CT::SignedInteger64<T>)
return V256<std::int64_t> {simde_mm256_unpacklo_epi64(m, Zero())};
else if constexpr (CT::UnsignedInteger64<T>)
return V256<std::uint64_t> {simde_mm256_unpacklo_epi64(m, Zero())};
return V256<std::uint64_t> {simde_mm256_cvtepu32_epi64(simde_mm256_extractf128_si256(m, 0))};
else
LANGULUS_ERROR("Can't unpack this type");
}

NOD() LANGULUS(INLINED)
auto UnpackHi() const noexcept {
if constexpr (CT::SignedInteger8<T>)
return V256<std::int16_t> {simde_mm256_unpackhi_epi8 (m, Zero())};
return V256<std::int16_t> {simde_mm256_cvtepi8_epi16 (simde_mm256_extractf128_si256(m, 1))};
else if constexpr (CT::UnsignedInteger8<T>)
return V256<std::uint16_t> {simde_mm256_unpackhi_epi8 (m, Zero())};
return V256<std::uint16_t> {simde_mm256_cvtepu8_epi16 (simde_mm256_extractf128_si256(m, 1))};
else if constexpr (CT::SignedInteger16<T>)
return V256<std::int32_t> {simde_mm256_unpackhi_epi16(m, Zero())};
return V256<std::int32_t> {simde_mm256_cvtepi16_epi32(simde_mm256_extractf128_si256(m, 1))};
else if constexpr (CT::UnsignedInteger16<T>)
return V256<std::uint32_t> {simde_mm256_unpackhi_epi16(m, Zero())};
return V256<std::uint32_t> {simde_mm256_cvtepu16_epi32(simde_mm256_extractf128_si256(m, 1))};
else if constexpr (CT::SignedInteger32<T>)
return V256<std::int64_t> {simde_mm256_unpackhi_epi32(m, Zero())};
return V256<std::int64_t> {simde_mm256_cvtepi32_epi64(simde_mm256_extractf128_si256(m, 1))};
else if constexpr (CT::UnsignedInteger32<T>)
return V256<std::uint64_t> {simde_mm256_unpackhi_epi32(m, Zero())};
else if constexpr (CT::SignedInteger64<T>)
return V256<std::int64_t> {simde_mm256_unpackhi_epi64(m, Zero())};
else if constexpr (CT::UnsignedInteger64<T>)
return V256<std::uint64_t> {simde_mm256_unpackhi_epi64(m, Zero())};
return V256<std::uint64_t> {simde_mm256_cvtepu32_epi64(simde_mm256_extractf128_si256(m, 1))};
else
LANGULUS_ERROR("Can't unpack this type");
}
Expand All @@ -574,10 +558,16 @@ namespace Langulus::SIMD
auto Pack() const noexcept {
if constexpr (CT::Integer8<T>)
return *this;
else if constexpr (CT::SignedInteger16<T>)
return V256<std::int8_t> {simde_mm256_packs_epi16 (m, Zero())};
else if constexpr (CT::UnsignedInteger16<T>)
return V256<std::uint8_t> {simde_mm256_packus_epi16(m, Zero())};
else if constexpr (CT::SignedInteger16<T>) {
const auto lo_lane = simde_mm256_castsi256_si128(m);
const auto hi_lane = simde_mm256_extracti128_si256(m, 1);
return V128<std::int8_t> {simde_mm_packs_epi16(lo_lane, hi_lane)};
}
else if constexpr (CT::UnsignedInteger16<T>) {
const auto lo_lane = simde_mm256_castsi256_si128(m);
const auto hi_lane = simde_mm256_extracti128_si256(m, 1);
return V128<std::uint8_t> {simde_mm_packus_epi16(lo_lane, hi_lane)};
}
else if constexpr (CT::SignedInteger32<T>)
return V256<std::int16_t> {simde_mm256_packs_epi32 (m, Zero())};
else if constexpr (CT::UnsignedInteger32<T>)
Expand Down Expand Up @@ -740,10 +730,6 @@ namespace Langulus::SIMD
return V512<std::int64_t> {simde_mm512_unpacklo_epi32(m, Zero())};
else if constexpr (CT::UnsignedInteger32<T>)
return V512<std::uint64_t> {simde_mm512_unpacklo_epi32(m, Zero())};
else if constexpr (CT::SignedInteger64<T>)
return V512<std::int64_t> {simde_mm512_unpacklo_epi64(m, Zero())};
else if constexpr (CT::UnsignedInteger64<T>)
return V512<std::uint64_t> {simde_mm512_unpacklo_epi64(m, Zero())};
else
LANGULUS_ERROR("Can't unpack this type");
}
Expand All @@ -762,10 +748,6 @@ namespace Langulus::SIMD
return V512<std::int64_t> {simde_mm512_unpackhi_epi32(m, Zero())};
else if constexpr (CT::UnsignedInteger32<T>)
return V512<std::uint64_t> {simde_mm512_unpackhi_epi32(m, Zero())};
else if constexpr (CT::SignedInteger64<T>)
return V512<std::int64_t> {simde_mm512_unpackhi_epi64(m, Zero())};
else if constexpr (CT::UnsignedInteger64<T>)
return V512<std::uint64_t> {simde_mm512_unpackhi_epi64(m, Zero())};
else
LANGULUS_ERROR("Can't unpack this type");
}
Expand All @@ -774,10 +756,20 @@ namespace Langulus::SIMD
auto Pack() const noexcept {
if constexpr (CT::Integer8<T>)
return *this;
else if constexpr (CT::SignedInteger16<T>)
return V512<std::int8_t> {simde_mm512_packs_epi16 (m, Zero())};
else if constexpr (CT::UnsignedInteger16<T>)
return V512<std::uint8_t> {simde_mm512_packus_epi16(m, Zero())};
else if constexpr (CT::SignedInteger16<T>) {
const auto lo_lane = simde_mm512_castsi512_si256(m);
const auto hi_lane = simde_mm512_extracti256_si512(m, 1);
return V256<std::int8_t> {
simde_mm256_packs_epi16(lo_lane, hi_lane)
}.Pack();
}
else if constexpr (CT::UnsignedInteger16<T>) {
const auto lo_lane = simde_mm512_castsi512_si256(m);
const auto hi_lane = simde_mm512_extracti256_si512(m, 1);
return V256<std::uint8_t> {
simde_mm256_packus_epi16(lo_lane, hi_lane)
}.Pack();
}
else if constexpr (CT::SignedInteger32<T>)
return V512<std::int16_t> {simde_mm512_packs_epi32 (m, Zero())};
else if constexpr (CT::UnsignedInteger32<T>)
Expand Down Expand Up @@ -1319,12 +1311,12 @@ namespace Langulus::SIMD
);

const auto C1 = simde_mm_or_si128(
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low.m, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low.m, 1), maskHi)
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low, 1), maskHi)
);
const auto C2 = simde_mm_or_si128(
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high.m, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high.m, 1), maskHi)
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high, 1), maskHi)
);

const auto C = simde_mm256_inserti128_si256(simde_mm256_setzero_si256(), C1, 0);
Expand Down Expand Up @@ -1368,8 +1360,8 @@ namespace Langulus::SIMD
);

const auto r = simde_mm_or_si128(
simde_mm_shuffle_epi8(low.m, maskLo),
simde_mm_shuffle_epi8(high.m, maskHi)
simde_mm_shuffle_epi8(low, maskLo),
simde_mm_shuffle_epi8(high, maskHi)
);
#endif

Expand Down Expand Up @@ -1403,12 +1395,12 @@ namespace Langulus::SIMD
);

const auto C1 = simde_mm_or_si128(
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low.m, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low.m, 1), maskHi)
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(low, 1), maskHi)
);
const auto C2 = simde_mm_or_si128(
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high.m, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high.m, 1), maskHi)
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high, 0), maskLo),
simde_mm_shuffle_epi8(simde_mm256_extracti128_si256(high, 1), maskHi)
);

auto C = simde_mm256_inserti128_si256(simde_mm256_setzero_si256(), C1, 0);
Expand Down
47 changes: 18 additions & 29 deletions source/SetGet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace Langulus::SIMD

if constexpr (REVERSE) {
if constexpr (MAXS - IDX - 1 < S) {
LANGULUS_SIMD_VERBOSE("Setting [", IDX, "] to ", DenseCast(values[MAXS - IDX - 1]));
LANGULUS_SIMD_VERBOSE("Setting [", IDX, "] to ", values[MAXS - IDX - 1]);
return reinterpret_cast<const R&>(values[MAXS - IDX - 1]);
}
else {
Expand All @@ -42,7 +42,7 @@ namespace Langulus::SIMD
}
else {
if constexpr (IDX < S) {
LANGULUS_SIMD_VERBOSE("Setting [", IDX, "] to ", DenseCast(values[IDX]));
LANGULUS_SIMD_VERBOSE("Setting [", IDX, "] to ", values[IDX]);
return reinterpret_cast<const R&>(values[IDX]);
}
else {
Expand All @@ -67,15 +67,13 @@ namespace Langulus::SIMD

#if LANGULUS_SIMD(128BIT)
if constexpr (CHUNK == 16) {
LANGULUS_SIMD_VERBOSE("Setting 128bit register from ",
CountOf<FROM>, " elements");

if constexpr (CT::Integer8<T>) return V128<T> {simde_mm_setr_epi8 (Get<int8_t, DEF, INDICES, 16>(values)...)};
else if constexpr (CT::Integer16<T>) return V128<T> {simde_mm_setr_epi16(Get<int16_t, DEF, INDICES, 8 >(values)...)};
else if constexpr (CT::Integer32<T>) return V128<T> {simde_mm_setr_epi32(Get<int32_t, DEF, INDICES, 4 >(values)...)};
else if constexpr (CT::Integer64<T>) return V128<T> {simde_mm_set_epi64x(Get<int64_t, DEF, INDICES, 2, true>(values)...)};
else if constexpr (CT::Float<T>) return V128<T> {simde_mm_setr_ps (Get<simde_float32, DEF, INDICES, 4 >(values)...)};
else if constexpr (CT::Double<T>) return V128<T> {simde_mm_setr_pd (Get<simde_float64, DEF, INDICES, 2 >(values)...)};
LANGULUS_SIMD_VERBOSE("Setting 128bit register from ", CountOf<FROM>, " elements");
if constexpr (CT::Integer8<T>) return V128<T> {simde_mm_setr_epi8 (Get<int8_t, DEF, INDICES, 16>(values)...)};
else if constexpr (CT::Integer16<T>) return V128<T> {simde_mm_setr_epi16(Get<int16_t, DEF, INDICES, 8>(values)...)};
else if constexpr (CT::Integer32<T>) return V128<T> {simde_mm_setr_epi32(Get<int32_t, DEF, INDICES, 4>(values)...)};
else if constexpr (CT::Integer64<T>) return V128<T> {simde_mm_set_epi64x(Get<int64_t, DEF, INDICES, 2, true>(values)...)};
else if constexpr (CT::Float<T>) return V128<T> {simde_mm_setr_ps (Get<simde_float32, DEF, INDICES, 4>(values)...)};
else if constexpr (CT::Double<T>) return V128<T> {simde_mm_setr_pd (Get<simde_float64, DEF, INDICES, 2>(values)...)};
else LANGULUS_ERROR("Can't set 16-byte package");
}
else
Expand All @@ -84,23 +82,18 @@ namespace Langulus::SIMD
#if LANGULUS_SIMD(256BIT)
if constexpr (CHUNK == 32) {
LANGULUS_SIMD_VERBOSE("Setting 256bit register from ", CountOf<FROM>, " elements");

if constexpr (CT::Integer8<T>) return V256<T> {simde_mm256_setr_epi8 (Get<int8_t, DEF, INDICES, 32>(values)...)};
if constexpr (CT::Integer8<T>) return V256<T> {simde_mm256_setr_epi8 (Get<int8_t, DEF, INDICES, 32>(values)...)};
else if constexpr (CT::Integer16<T>) return V256<T> {simde_mm256_setr_epi16(Get<int16_t, DEF, INDICES, 16>(values)...)};
else if constexpr (CT::Integer32<T>) return V256<T> {simde_mm256_setr_epi32(Get<int32_t, DEF, INDICES, 8 >(values)...)};
else if constexpr (CT::Integer32<T>) return V256<T> {simde_mm256_setr_epi32(Get<int32_t, DEF, INDICES, 8>(values)...)};
else if constexpr (CT::Integer64<T>) {
// This hits a very nasty MSVC compiler bug
// The workaround is temporary, hopefully
// https://stackoverflow.com/questions/77191454
#if LANGULUS_COMPILER(MSVC) and LANGULUS_BITNESS() == 32 and (LANGULUS_SIMD(AVX) or LANGULUS_SIMD(AVX2))
alignas(32) const int64_t temp[4] {Get<int64_t, DEF, INDICES, 4>(values)...};
return V256<T> {
simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(temp))
};
return V256<T> {simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(temp))};
#else
return V256<T> {
simde_mm256_setr_epi64x(Get<int64_t, DEF, INDICES, 4>(values)...)
};
return V256<T> {simde_mm256_setr_epi64x(Get<int64_t, DEF, INDICES, 4>(values)...)};
#endif
}
else if constexpr (CT::Float<T>) return V256<T> {simde_mm256_setr_ps(Get<simde_float32, DEF, INDICES, 8>(values)...)};
Expand All @@ -113,13 +106,12 @@ namespace Langulus::SIMD
#if LANGULUS_SIMD(512BIT)
if constexpr (CHUNK == 64) {
LANGULUS_SIMD_VERBOSE("Setting 512bit register from ", CountOf<FROM>, " elements");

if constexpr (CT::Integer8<T>) return V512<T> {simde_mm512_setr_epi8 (Get<int8_t, DEF, INDICES, 64>(values)...)};
if constexpr (CT::Integer8<T>) return V512<T> {simde_mm512_setr_epi8 (Get<int8_t, DEF, INDICES, 64>(values)...)};
else if constexpr (CT::Integer16<T>) return V512<T> {simde_mm512_setr_epi16(Get<int16_t, DEF, INDICES, 32>(values)...)};
else if constexpr (CT::Integer32<T>) return V512<T> {simde_mm512_setr_epi32(Get<int32_t, DEF, INDICES, 16>(values)...)};
else if constexpr (CT::Integer64<T>) return V512<T> {simde_mm512_setr_epi64(Get<int64_t, DEF, INDICES, 8 >(values)...)};
else if constexpr (CT::Integer64<T>) return V512<T> {simde_mm512_setr_epi64(Get<int64_t, DEF, INDICES, 8>(values)...)};
else if constexpr (CT::Float<T>) return V512<T> {simde_mm512_setr_ps (Get<simde_float32, DEF, INDICES, 16>(values)...)};
else if constexpr (CT::Double<T>) return V512<T> {simde_mm512_setr_pd (Get<simde_float64, DEF, INDICES, 8 >(values)...)};
else if constexpr (CT::Double<T>) return V512<T> {simde_mm512_setr_pd (Get<simde_float64, DEF, INDICES, 8>(values)...)};
else LANGULUS_ERROR("Can't set 64-byte package");
}
else
Expand All @@ -141,11 +133,8 @@ namespace Langulus::SIMD
auto Set(const FROM& values) noexcept {
using T = TypeOf<FROM>;
constexpr auto S = CountOf<FROM>;
constexpr auto MaxS = CHUNK / sizeof(Decay<T>);
static_assert((CT::Dense<T> and MaxS > S)
or (CT::Sparse<T> and MaxS >= S),
"S should be smaller (or equal if sparse) than MaxS - use load otherwise");

constexpr auto MaxS = CHUNK / sizeof(T);
static_assert(MaxS > S, "S should be smaller than MaxS - use load otherwise");
return Inner::Set<DEF, CHUNK>(Sequence<MaxS>::Expand, values);
}

Expand Down
Loading

0 comments on commit 06de125

Please sign in to comment.