Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

granular parallel generic kernel for 64u_byteswap #679

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions kernels/volk/volk_64u_byteswap.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,30 @@
#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_GENERIC
/* Adapted from https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
* where they reverse the bits in an N-bit word. But who's stopping me from doing the same
* at the byte level?
* The idea is simple: swap the elementary units with half of them "selected" each step, in a
* Hadamard kind of selection.
*/

/*
 * Reverse the byte order of every 64-bit word in the buffer, in place.
 *
 * intsToSwap: buffer of 64-bit words, overwritten with the swapped values.
 * num_points: number of 64-bit words to process.
 *
 * Each word goes through three mask-and-shift passes that exchange
 * neighbouring units of doubling width: bytes, 16-bit pairs, 32-bit halves.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int idx;
    for (idx = 0; idx != num_points; ++idx) {
        uint64_t word = intsToSwap[idx];

        /* pass 1: exchange adjacent bytes */
        word = ((word >> 8) & 0x00FF00FF00FF00FFULL) |
               ((word & 0x00FF00FF00FF00FFULL) << 8);

        /* pass 2: exchange adjacent 16-bit units */
        word = ((word >> 16) & 0x0000FFFF0000FFFFULL) |
               ((word & 0x0000FFFF0000FFFFULL) << 16);

        /* pass 3: exchange the two 32-bit halves (no mask needed, bits shift out) */
        word = (word >> 32) | (word << 32);

        intsToSwap[idx] = word;
    }
}
#endif

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

Expand Down Expand Up @@ -109,30 +133,6 @@ static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int n
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_GENERIC

/*
 * Reverse the byte order of every 64-bit word in the buffer, in place.
 *
 * intsToSwap: buffer of 64-bit words, overwritten with the swapped values.
 * num_points: number of 64-bit words to process.
 *
 * Each word is decomposed into its two 32-bit halves; each half is
 * byte-reversed and the halves are then exchanged. The halves are taken
 * from the 64-bit value with shifts rather than by reading the buffer
 * through a uint32_t pointer, which would violate the strict-aliasing
 * rules. The result does not depend on host endianness.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        const uint64_t value = intsToSwap[point];
        uint32_t low = (uint32_t)value;
        uint32_t high = (uint32_t)(value >> 32);

        /* byte-reverse each 32-bit half */
        low = (((low >> 24) & 0xff) | ((low >> 8) & 0x0000ff00) |
               ((low << 8) & 0x00ff0000) | ((low << 24) & 0xff000000));

        high = (((high >> 24) & 0xff) | ((high >> 8) & 0x0000ff00) |
                ((high << 8) & 0x00ff0000) | ((high << 24) & 0xff000000));

        /* exchange the halves: the reversed low half becomes the high half */
        intsToSwap[point] = ((uint64_t)low << 32) | (uint64_t)high;
    }
}
#endif /* LV_HAVE_GENERIC */

#if LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
Expand Down Expand Up @@ -476,8 +476,8 @@ static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,

#ifdef LV_HAVE_GENERIC

static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
unsigned int num_points)
static inline void volk_64u_byteswap_generic_decompose(uint64_t* intsToSwap,
unsigned int num_points)
{
uint32_t* inputPtr = (uint32_t*)intsToSwap;
unsigned int point;
Expand Down
11 changes: 11 additions & 0 deletions kernels/volk/volk_64u_byteswappuppet_64u.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@
#include <string.h>
#include <volk/volk_64u_byteswap.h>

#ifdef LV_HAVE_GENERIC
/*
 * Puppet wrapper around the in-place volk_64u_byteswap_generic_decompose
 * kernel that also provides a separate output buffer.
 *
 * output:     receives a copy of the data after the kernel has run.
 * intsToSwap: input buffer; the kernel is applied to it in place.
 * num_points: number of 64-bit words in each buffer.
 */
static inline void volk_64u_byteswappuppet_64u_generic_decompose(uint64_t* output,
                                                                 uint64_t* intsToSwap,
                                                                 unsigned int num_points)
{
    const size_t byte_count = num_points * sizeof(uint64_t);

    /* run the in-place kernel on the input first ... */
    volk_64u_byteswap_generic_decompose(intsToSwap, num_points);

    /* ... then copy the processed input into the output buffer */
    memcpy(output, intsToSwap, byte_count);
}
#endif

#ifdef LV_HAVE_GENERIC
static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
uint64_t* intsToSwap,
Expand Down
Loading