diff --git a/src_c/alphablit.c b/src_c/alphablit.c index f428dd1bac..7174713154 100644 --- a/src_c/alphablit.c +++ b/src_c/alphablit.c @@ -24,7 +24,6 @@ #define NO_PYGAME_C_API #include "_surface.h" -#include "simd_shared.h" #include "simd_blitters.h" static void @@ -57,6 +56,12 @@ blit_blend_rgba_max(SDL_BlitInfo *info); static void blit_blend_premultiplied(SDL_BlitInfo *info); +void +premul_surf_color_by_alpha_non_simd(SDL_Surface *src, + PG_PixelFormat *src_format, + SDL_Palette *src_palette, SDL_Surface *dst, + PG_PixelFormat *dst_format, + SDL_Palette *dst_palette); static int SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, @@ -164,8 +169,7 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, case 0: { if (info.src_blend != SDL_BLENDMODE_NONE && info.src->Amask) { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -174,7 +178,8 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, /* If our source and destination are the same ARGB 32bit format we can use SSE2/NEON/AVX2 to speed up the blend */ - if (pg_has_avx2() && (src != dst)) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2() && (src != dst)) { if (info.src_blanket_alpha != 255) { alphablit_alpha_avx2_argb_surf_alpha( &info); @@ -192,8 +197,9 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, } break; } -#if PG_ENABLE_SSE_NEON - if ((pg_HasSSE_NEON()) && (src != dst)) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if ((PG_HAS_SSE2_OR_NEON()) && (src != dst)) { if (info.src_blanket_alpha != 255) { alphablit_alpha_sse2_argb_surf_alpha( &info); @@ -211,10 +217,9 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, } break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* 
__EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ alphablit_alpha(&info); } else if (info.src_has_colorkey) { @@ -226,8 +231,8 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, break; } case PYGAME_BLEND_ADD: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -235,11 +240,12 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgb_add_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -247,19 +253,18 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgb_add_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_add(&info); break; } case PYGAME_BLEND_SUB: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -267,11 +272,12 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - 
pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgb_sub_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -279,19 +285,18 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgb_sub_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_sub(&info); break; } case PYGAME_BLEND_MULT: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -299,11 +304,12 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgb_mul_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -311,19 +317,18 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgb_mul_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif 
/* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_mul(&info); break; } case PYGAME_BLEND_MIN: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -331,11 +336,12 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgb_min_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -343,19 +349,18 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgb_min_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_min(&info); break; } case PYGAME_BLEND_MAX: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -363,11 +368,12 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() 
&& (src != dst)) { blit_blend_rgb_max_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -375,181 +381,181 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && !(info.src->Amask != 0 && info.dst->Amask != 0 && info.src->Amask != info.dst->Amask) && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgb_max_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_max(&info); break; } case PYGAME_BLEND_RGBA_ADD: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgba_add_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgba_add_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_rgba_add(&info); break; } case PYGAME_BLEND_RGBA_SUB: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER 
== SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgba_sub_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgba_sub_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_rgba_sub(&info); break; } case PYGAME_BLEND_RGBA_MULT: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgba_mul_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgba_mul_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* 
SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_rgba_mul(&info); break; } case PYGAME_BLEND_RGBA_MIN: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgba_min_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgba_min_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_rgba_min(&info); break; } case PYGAME_BLEND_RGBA_MAX: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_rgba_max_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && 
info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_rgba_max_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_rgba_max(&info); break; } case PYGAME_BLEND_PREMULTIPLIED: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && info.src->Gmask == info.dst->Gmask && info.src->Bmask == info.dst->Bmask && info.src_blend != SDL_BLENDMODE_NONE && - pg_has_avx2() && (src != dst)) { + PG_HAS_AVX2() && (src != dst)) { blit_blend_premultiplied_avx2(&info); break; } -#if PG_ENABLE_SSE_NEON +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON if (PG_SURF_BytesPerPixel(src) == 4 && PG_SURF_BytesPerPixel(dst) == 4 && info.src->Rmask == info.dst->Rmask && @@ -557,13 +563,12 @@ SoftBlitPyGame(SDL_Surface *src, SDL_Rect *srcrect, SDL_Surface *dst, info.src->Bmask == info.dst->Bmask && info.src->Amask == 0xFF000000 && info.src_blend != SDL_BLENDMODE_NONE && - pg_HasSSE_NEON() && (src != dst)) { + PG_HAS_SSE2_OR_NEON() && (src != dst)) { blit_blend_premultiplied_sse2(&info); break; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ blit_blend_premultiplied(&info); break; @@ -2958,26 +2963,20 @@ premul_surf_color_by_alpha(SDL_Surface *src, SDL_Surface *dst) if (src_blend == SDL_BLENDMODE_NONE && !(src_format->Amask != 0)) return -1; // since we know dst is a copy of src we can simplify the normal checks -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN - if ((PG_SURF_BytesPerPixel(src) == 4) && pg_has_avx2()) 
{ +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_AVX2 + if ((PG_SURF_BytesPerPixel(src) == 4) && PG_HAS_AVX2()) { premul_surf_color_by_alpha_avx2(src, dst); return 0; } -#if defined(__SSE2__) - if ((PG_SURF_BytesPerPixel(src) == 4) && SDL_HasSSE2()) { - premul_surf_color_by_alpha_sse2(src, dst); - return 0; - } -#endif /* __SSE2__*/ -#if PG_ENABLE_ARM_NEON - if ((PG_SURF_BytesPerPixel(src) == 4) && SDL_HasNEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if ((PG_SURF_BytesPerPixel(src) == 4) && PG_HAS_SSE2_OR_NEON()) { premul_surf_color_by_alpha_sse2(src, dst); return 0; } -#endif /* PG_ENABLE_ARM_NEON */ -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ premul_surf_color_by_alpha_non_simd(src, src_format, src_palette, dst, dst_format, dst_palette); return 0; diff --git a/src_c/simd_blitters.h b/src_c/simd_blitters.h index 0c02207c26..0996271885 100644 --- a/src_c/simd_blitters.h +++ b/src_c/simd_blitters.h @@ -1,14 +1,9 @@ #define NO_PYGAME_C_API #include "_surface.h" #include "_blit_info.h" +#include "simd_shared.h" -#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__) -// arm64 has neon optimisations enabled by default, even when fpu=neon is not -// passed -#define PG_ENABLE_ARM_NEON 1 -#endif - -#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) +#ifdef PG_HAS_SSE2_OR_NEON void alphablit_alpha_sse2_argb_surf_alpha(SDL_BlitInfo *info); void @@ -37,26 +32,11 @@ void blit_blend_rgb_min_sse2(SDL_BlitInfo *info); void blit_blend_premultiplied_sse2(SDL_BlitInfo *info); -#endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */ - -/* Deliberately putting these outside of the preprocessor guards as I want to - move to a system of trusting the runtime checks to head to the right - function and having a fallback function there if pygame is not compiled - with the right stuff (this is the strategy used for AVX2 right now. 
- Potentially I might want to shift both these into a slightly different - file as they are not exactly blits (though v. similar) - or I could rename - the SIMD trilogy of files to replace the word blit with something more - generic like surface_ops*/ - -void -premul_surf_color_by_alpha_non_simd(SDL_Surface *src, - PG_PixelFormat *src_format, - SDL_Palette *src_palette, SDL_Surface *dst, - PG_PixelFormat *dst_format, - SDL_Palette *dst_palette); void premul_surf_color_by_alpha_sse2(SDL_Surface *src, SDL_Surface *dst); +#endif /* PG_HAS_SSE2_OR_NEON */ +#ifdef PG_HAS_AVX2 void alphablit_alpha_avx2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info); void @@ -87,3 +67,4 @@ void blit_blend_premultiplied_avx2(SDL_BlitInfo *info); void premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst); +#endif diff --git a/src_c/simd_blitters_avx2.c b/src_c/simd_blitters_avx2.c index d113b2b87e..c2e4bf9340 100644 --- a/src_c/simd_blitters_avx2.c +++ b/src_c/simd_blitters_avx2.c @@ -1,44 +1,16 @@ #include "simd_blitters.h" -#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) -#include <immintrin.h> -#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#define BAD_AVX2_FUNCTION_CALL \ - printf( \ - "Fatal Error: Attempted calling an AVX2 function when both compile " \ - "time and runtime support is missing. If you are seeing this " \ - "message, you have stumbled across a pygame bug, please report it " \ - "to the devs!"); \ - PG_EXIT(1) - -/* helper function that does a runtime check for AVX2. 
It has the added - * functionality of also returning 0 if compile time support is missing */ -int -pg_has_avx2() -{ -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) - return SDL_HasAVX2(); -#else - return 0; -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ -} - /* This returns 1 when avx2 is available at runtime but support for it isn't * compiled in, 0 in all other cases */ int pg_avx2_at_runtime_but_uncompiled() { if (SDL_HasAVX2()) { -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) +#ifdef PG_HAS_AVX2 return 0; #else return 1; -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ +#endif /* PG_HAS_AVX2 */ } return 0; } @@ -190,8 +162,7 @@ pg_avx2_at_runtime_but_uncompiled() _mm256_srli_epi16( \ _mm256_mulhi_epu16(MM256I, _mm256_set1_epi16((short)0x8081)), 7); -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) +#ifdef PG_HAS_AVX2 void alphablit_alpha_avx2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) { @@ -258,17 +229,7 @@ alphablit_alpha_avx2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) * surfaces. 
*/ pixels_dst = _mm256_and_si256(pixels_dst, mask_out_alpha);) } -#else -void -alphablit_alpha_avx2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void alphablit_alpha_avx2_argb_no_surf_alpha(SDL_BlitInfo *info) { @@ -324,17 +285,7 @@ alphablit_alpha_avx2_argb_no_surf_alpha(SDL_BlitInfo *info) shuff_dst = _mm256_blendv_epi8(shuff_dst, new_dst_alpha, combine_rgba_mask);)) } -#else -void -alphablit_alpha_avx2_argb_no_surf_alpha(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void alphablit_alpha_avx2_argb_surf_alpha(SDL_BlitInfo *info) { @@ -406,17 +357,6 @@ alphablit_alpha_avx2_argb_surf_alpha(SDL_BlitInfo *info) shuff_dst = _mm256_blendv_epi8(shuff_dst, new_dst_alpha, combine_rgba_mask);)) } -#else -void -alphablit_alpha_avx2_argb_surf_alpha(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgba_mul_avx2(SDL_BlitInfo *info) { @@ -524,17 +464,6 @@ blit_blend_rgba_mul_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgba_mul_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgb_mul_avx2(SDL_BlitInfo *info) { @@ -653,17 +582,6 @@ blit_blend_rgb_mul_avx2(SDL_BlitInfo *info) dstp = 
(Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgb_mul_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgba_add_avx2(SDL_BlitInfo *info) { @@ -725,17 +643,6 @@ blit_blend_rgba_add_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgba_add_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgb_add_avx2(SDL_BlitInfo *info) { @@ -805,17 +712,6 @@ blit_blend_rgb_add_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgb_add_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgba_sub_avx2(SDL_BlitInfo *info) { @@ -877,17 +773,6 @@ blit_blend_rgba_sub_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgba_sub_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgb_sub_avx2(SDL_BlitInfo *info) { @@ -957,17 +842,6 @@ blit_blend_rgb_sub_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgb_sub_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) 
*/ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgba_max_avx2(SDL_BlitInfo *info) { @@ -1029,17 +903,6 @@ blit_blend_rgba_max_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgba_max_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgb_max_avx2(SDL_BlitInfo *info) { @@ -1109,17 +972,6 @@ blit_blend_rgb_max_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgb_max_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgba_min_avx2(SDL_BlitInfo *info) { @@ -1181,17 +1033,6 @@ blit_blend_rgba_min_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgba_min_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_rgb_min_avx2(SDL_BlitInfo *info) { @@ -1261,17 +1102,6 @@ blit_blend_rgb_min_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_rgb_min_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void blit_blend_premultiplied_avx2(SDL_BlitInfo *info) { @@ -1521,14 +1351,6 @@ 
blit_blend_premultiplied_avx2(SDL_BlitInfo *info) dstp = (Uint32 *)dstp256 + dstskip; } } -#else -void -blit_blend_premultiplied_avx2(SDL_BlitInfo *info) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ #define PREMUL_ALPHA_CODE \ /* extract the alpha */ \ @@ -1558,8 +1380,6 @@ blit_blend_premultiplied_avx2(SDL_BlitInfo *info) /*add the original alpha back in*/ \ mm_dst = _mm256_or_si256(mm_dst, mm_alpha_in); -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) void premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst) { @@ -1635,11 +1455,4 @@ premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst) dstp += dst_skip; } } -#else -void -premul_surf_color_by_alpha_avx2(SDL_Surface *src, SDL_Surface *dst) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ -!defined(SDL_DISABLE_IMMINTRIN_H) */ +#endif /* PG_HAS_AVX2 */ diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c index 65bb926721..ae5e2bbb95 100644 --- a/src_c/simd_blitters_sse2.c +++ b/src_c/simd_blitters_sse2.c @@ -1,24 +1,5 @@ #include "simd_blitters.h" -#if PG_ENABLE_ARM_NEON -// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon -#include "include/sse2neon.h" -#endif /* PG_ENABLE_ARM_NEON */ - -/* See if we are compiled 64 bit on GCC or MSVC */ -#if _WIN32 || _WIN64 -#if _WIN64 -#define ENV64BIT -#endif -#endif - -// Check GCC -#if __GNUC__ -#if __x86_64__ || __ppc64__ || __aarch64__ -#define ENV64BIT -#endif -#endif - /* This returns 1 when sse2 is available at runtime but support for it isn't * compiled in, 0 in all other cases */ int @@ -139,7 +120,7 @@ pg_neon_at_runtime_but_uncompiled() dstp = (Uint32 *)dstp128 + dstskip; \ } -#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +#ifdef PG_HAS_SSE2_OR_NEON void alphablit_alpha_sse2_argb_surf_alpha(SDL_BlitInfo *info) { @@ -959,4 
+940,4 @@ blit_blend_rgba_max_sse2(SDL_BlitInfo *info) SETUP_SSE2_BLITTER RUN_SSE2_BLITTER({ mm128_dst = _mm_max_epu8(mm128_dst, mm128_src); }) } -#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ +#endif /* PG_HAS_SSE2_OR_NEON */ diff --git a/src_c/simd_fill.h b/src_c/simd_fill.h index db80008c1d..35b605efdd 100644 --- a/src_c/simd_fill.h +++ b/src_c/simd_fill.h @@ -1,44 +1,9 @@ #define NO_PYGAME_C_API #include "_surface.h" - -#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__) -// arm64 has neon optimisations enabled by default, even when fpu=neon is not -// passed -#define PG_ENABLE_ARM_NEON 1 -#endif - -/* See if we are compiled 64 bit on GCC or MSVC */ -#if _WIN32 || _WIN64 -#if _WIN64 -#define ENV64BIT -#endif -#endif - -// Check GCC -#if __GNUC__ -#if __x86_64__ || __ppc64__ || __aarch64__ -#define ENV64BIT -#endif -#endif - -#if defined(__SSE2__) -#define PG_ENABLE_SSE_NEON 1 -#elif PG_ENABLE_ARM_NEON -#define PG_ENABLE_SSE_NEON 1 -#else -#define PG_ENABLE_SSE_NEON 0 -#endif - -int -_pg_has_avx2(); - -/* This returns True if either SSE2 or NEON is present at runtime. - * Relevant because they use the same codepaths. 
Only the relevant runtime - * SDL cpu feature check is compiled in.*/ -int -_pg_HasSSE_NEON(); +#include "simd_shared.h" // AVX2 functions +#ifdef PG_HAS_AVX2 int surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color); @@ -70,7 +35,10 @@ surface_fill_blend_max_avx2(SDL_Surface *surface, SDL_Rect *rect, int surface_fill_blend_rgba_max_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color); +#endif + // SSE2 functions +#ifdef PG_HAS_SSE2_OR_NEON int surface_fill_blend_add_sse2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color); @@ -101,3 +69,4 @@ surface_fill_blend_max_sse2(SDL_Surface *surface, SDL_Rect *rect, int surface_fill_blend_rgba_max_sse2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color); +#endif diff --git a/src_c/simd_shared.h b/src_c/simd_shared.h index a697fffb0c..ca7e9a76ab 100644 --- a/src_c/simd_shared.h +++ b/src_c/simd_shared.h @@ -10,8 +10,6 @@ int pg_neon_at_runtime_but_uncompiled(); int pg_avx2_at_runtime_but_uncompiled(); -int -pg_has_avx2(); #if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__) // arm64 has neon optimisations enabled by default, even when fpu=neon is not @@ -33,31 +31,27 @@ pg_has_avx2(); #endif #endif -/* This defines PG_ENABLE_SSE_NEON as True if either SSE or NEON is available - * at compile time. Since we do compile time translation of SSE2->NEON, they - * have the same code paths, so this reduces code duplication of those paths. - */ -#if defined(__SSE2__) -#define PG_ENABLE_SSE_NEON 1 -#elif PG_ENABLE_ARM_NEON -#define PG_ENABLE_SSE_NEON 1 -#else -#define PG_ENABLE_SSE_NEON 0 -#endif +#if defined(__EMSCRIPTEN__) || SDL_BYTEORDER != SDL_LIL_ENDIAN +#define PG_SIMD_DISABLED 1 +#endif /* PG_SIMD_DISABLED */ -/* This returns True if either SSE2 or NEON is present at runtime. - * Relevant because they use the same codepaths. 
Only the relevant runtime - * SDL cpu feature check is compiled in.*/ -int -pg_HasSSE_NEON() -{ -#if defined(__SSE2__) - return SDL_HasSSE2(); +#ifndef PG_SIMD_DISABLED + +#ifdef __SSE2__ +#define PG_HAS_SSE2_OR_NEON SDL_HasSSE2 #elif PG_ENABLE_ARM_NEON - return SDL_HasNEON(); -#else - return 0; -#endif -} +// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon +#include "include/sse2neon.h" +#define PG_HAS_SSE2_OR_NEON SDL_HasNEON +#endif /* PG_ENABLE_ARM_NEON */ + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +#include <immintrin.h> +#define PG_HAS_AVX2 SDL_HasAVX2 +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + * !defined(SDL_DISABLE_IMMINTRIN_H) */ + +#endif /* PG_SIMD_DISABLED */ #endif // SIMD_SHARED_H diff --git a/src_c/simd_surface_fill_avx2.c b/src_c/simd_surface_fill_avx2.c index 0c0d27ef21..0426b8f801 100644 --- a/src_c/simd_surface_fill_avx2.c +++ b/src_c/simd_surface_fill_avx2.c @@ -1,31 +1,5 @@ #include "simd_fill.h" -#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) -#include <immintrin.h> -#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#define BAD_AVX2_FUNCTION_CALL \ - printf( \ - "Fatal Error: Attempted calling an AVX2 function when both compile " \ - "time and runtime support is missing. If you are seeing this " \ - "message, you have stumbled across a pygame bug, please report it " \ - "to the devs!"); \ - PG_EXIT(1) - -/* helper function that does a runtime check for AVX2. 
It has the added - * functionality of also returning 0 if compile time support is missing */ -int -_pg_has_avx2() -{ -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) - return SDL_HasAVX2(); -#else - return 0; -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ -} - #define SETUP_AVX2_FILLER(COLOR_PROCESS_CODE) \ /* initialize surface data */ \ int width = rect->w, height = rect->h; \ @@ -139,20 +113,6 @@ _pg_has_avx2() return 0; \ } -#define INVALID_DEFS(NAME) \ - int surface_fill_blend_##NAME##_avx2(SDL_Surface *surface, \ - SDL_Rect *rect, Uint32 color) \ - { \ - BAD_AVX2_FUNCTION_CALL; \ - return -1; \ - } \ - int surface_fill_blend_rgba_##NAME##_avx2(SDL_Surface *surface, \ - SDL_Rect *rect, Uint32 color) \ - { \ - BAD_AVX2_FUNCTION_CALL; \ - return -1; \ - } - #define ADD_CODE mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_color); #define SUB_CODE mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_color); #define MIN_CODE mm256_dst = _mm256_min_epu8(mm256_dst, mm256_color); @@ -164,18 +124,10 @@ _pg_has_avx2() shuff_dst = _mm256_srli_epi16(shuff_dst, 8); \ } -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) +#ifdef PG_HAS_AVX2 FILLERS(add, color &= ~amask;, ADD_CODE) FILLERS(sub, color &= ~amask;, SUB_CODE) FILLERS(min, color |= amask;, MIN_CODE) FILLERS(max, color &= ~amask;, MAX_CODE) FILLERS_SHUFF(mult, color |= amask;, MULT_CODE) -#else -INVALID_DEFS(add) -INVALID_DEFS(sub) -INVALID_DEFS(min) -INVALID_DEFS(max) -INVALID_DEFS(mult) -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ +#endif /* PG_HAS_AVX2 */ diff --git a/src_c/simd_surface_fill_sse2.c b/src_c/simd_surface_fill_sse2.c index 7e4a80b030..9f0f23ad55 100644 --- a/src_c/simd_surface_fill_sse2.c +++ b/src_c/simd_surface_fill_sse2.c @@ -1,30 +1,5 @@ #include "simd_fill.h" -#if PG_ENABLE_ARM_NEON -// sse2neon.h is from 
here: https://github.com/DLTcollab/sse2neon -#include "include/sse2neon.h" -#endif /* PG_ENABLE_ARM_NEON */ - -#define BAD_SSE2_FUNCTION_CALL \ - printf( \ - "Fatal Error: Attempted calling an SSE2 function when both compile " \ - "time and runtime support is missing. If you are seeing this " \ - "message, you have stumbled across a pygame bug, please report it " \ - "to the devs!"); \ - PG_EXIT(1) - -int -_pg_HasSSE_NEON() -{ -#if defined(__SSE2__) - return SDL_HasSSE2(); -#elif PG_ENABLE_ARM_NEON - return SDL_HasNEON(); -#else - return 0; -#endif -} - #define SETUP_SSE2_FILLER(COLOR_PROCESS_CODE) \ /* initialize surface data */ \ int width = rect->w, height = rect->h; \ @@ -130,20 +105,6 @@ _pg_HasSSE_NEON() return 0; \ } -#define INVALID_DEFS(NAME) \ - int surface_fill_blend_##NAME##_sse2(SDL_Surface *surface, \ - SDL_Rect *rect, Uint32 color) \ - { \ - BAD_SSE2_FUNCTION_CALL; \ - return -1; \ - } \ - int surface_fill_blend_rgba_##NAME##_sse2(SDL_Surface *surface, \ - SDL_Rect *rect, Uint32 color) \ - { \ - BAD_SSE2_FUNCTION_CALL; \ - return -1; \ - } - #define ADD_CODE mm128_dst = _mm_adds_epu8(mm128_dst, mm128_color); #define SUB_CODE mm128_dst = _mm_subs_epu8(mm128_dst, mm128_color); #define MIN_CODE mm128_dst = _mm_min_epu8(mm128_dst, mm128_color); @@ -155,16 +116,10 @@ _pg_HasSSE_NEON() shuff_dst = _mm_srli_epi16(shuff_dst, 8); \ } -#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +#ifdef PG_HAS_SSE2_OR_NEON FILLERS(add, color &= ~amask;, ADD_CODE) FILLERS(sub, color &= ~amask;, SUB_CODE) FILLERS(min, color |= amask;, MIN_CODE) FILLERS(max, color &= ~amask;, MAX_CODE) FILLERS_SHUFF(mult, color |= amask;, MULT_CODE) -#else -INVALID_DEFS(add) -INVALID_DEFS(sub) -INVALID_DEFS(min) -INVALID_DEFS(max) -INVALID_DEFS(mult) -#endif /* defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) */ +#endif /* PG_HAS_SSE2_OR_NEON */ diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h index 45b85f651f..b9882540c6 100644 --- a/src_c/simd_transform.h +++ 
b/src_c/simd_transform.h @@ -1,6 +1,8 @@ #define NO_PYGAME_C_API #include "_surface.h" +#include "simd_shared.h" + /** * MACRO borrowed from SSE2NEON - useful for making the shuffling family of * intrinsics easier to understand by indicating clearly what will go where. @@ -16,14 +18,8 @@ #define _PG_SIMD_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) -#if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__) -// arm64 has neon optimisations enabled by default, even when fpu=neon is not -// passed -#define PG_ENABLE_ARM_NEON 1 -#endif - // SSE2 functions -#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +#ifdef PG_HAS_SSE2_OR_NEON void grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf); @@ -43,10 +39,12 @@ filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, void invert_sse2(SDL_Surface *src, SDL_Surface *newsurf); -#endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */ +#endif /* PG_HAS_SSE2_OR_NEON */ // AVX2 functions +#ifdef PG_HAS_AVX2 void grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf); void invert_avx2(SDL_Surface *src, SDL_Surface *newsurf); +#endif /* PG_HAS_AVX2 */ diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 18c27ac355..ac458fc917 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -1,50 +1,6 @@ #include "simd_transform.h" -#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) -#include <immintrin.h> -#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */ - -#define BAD_AVX2_FUNCTION_CALL \ - printf( \ - "Fatal Error: Attempted calling an AVX2 function when both compile " \ - "time and runtime support is missing. If you are seeing this " \ - "message, you have stumbled across a pygame bug, please report it " \ - "to the devs!"); \ - PG_EXIT(1) - -/* helper function that does a runtime check for AVX2. 
It has the added - * functionality of also returning 0 if compile time support is missing */ -int -pg_has_avx2() -{ -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) - return SDL_HasAVX2(); -#else - return 0; -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ -} - -/* This returns 1 when avx2 is available at runtime but support for it isn't - * compiled in, 0 in all other cases */ -int -pg_avx2_at_runtime_but_uncompiled() -{ - if (SDL_HasAVX2()) { -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) - return 0; -#else - return 1; -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ - } - return 0; -} - -#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) +#ifdef PG_HAS_AVX2 void grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) { @@ -298,16 +254,4 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) srcp256 = (__m256i *)srcp; } } -#else -void -grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) -{ - BAD_AVX2_FUNCTION_CALL; -} -void -invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) -{ - BAD_AVX2_FUNCTION_CALL; -} -#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ - !defined(SDL_DISABLE_IMMINTRIN_H) */ +#endif /* PG_HAS_AVX2 */ diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 174ee09378..bf916549a5 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -1,41 +1,6 @@ #include "simd_transform.h" -#if PG_ENABLE_ARM_NEON -// sse2neon.h is from here: https://github.com/DLTcollab/sse2neon -#include "include/sse2neon.h" -#endif /* PG_ENABLE_ARM_NEON */ - -/* This returns 1 when sse2 is available at runtime but support for it isn't - * compiled in, 0 in all other cases */ -int -pg_sse2_at_runtime_but_uncompiled() -{ - if (SDL_HasSSE2()) { -#ifdef __SSE2__ - return 0; -#else - 
return 1; -#endif /* __SSE2__ */ - } - return 0; -} - -/* This returns 1 when neon is available at runtime but support for it isn't - * compiled in, 0 in all other cases */ -int -pg_neon_at_runtime_but_uncompiled() -{ - if (SDL_HasNEON()) { -#if PG_ENABLE_ARM_NEON - return 0; -#else - return 1; -#endif /* PG_ENABLE_ARM_NEON */ - } - return 0; -} - -#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) +#ifdef PG_HAS_SSE2_OR_NEON // For some reason this is not defined on some non Windows compilers #define _pg_loadu_si32(p) _mm_cvtsi32_si128(*(unsigned int const *)(p)) @@ -686,4 +651,4 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) } } -#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ +#endif /* PG_HAS_SSE2_OR_NEON */ diff --git a/src_c/surface_fill.c b/src_c/surface_fill.c index 6fa3f984dd..a536be09e3 100644 --- a/src_c/surface_fill.c +++ b/src_c/surface_fill.c @@ -922,215 +922,215 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color, switch (blendargs) { case PYGAME_BLEND_ADD: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_add_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_add_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_add(surface, rect, color); break; } case PYGAME_BLEND_SUB: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = 
surface_fill_blend_sub_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_sub_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_sub(surface, rect, color); break; } case PYGAME_BLEND_MULT: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_mult_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_mult_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_mult(surface, rect, color); break; } case PYGAME_BLEND_MIN: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_min_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_min_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_min(surface, rect, color); break; } case 
PYGAME_BLEND_MAX: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_max_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_max_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_max(surface, rect, color); break; } case PYGAME_BLEND_RGBA_ADD: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_rgba_add_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_rgba_add_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_rgba_add(surface, rect, color); break; } case PYGAME_BLEND_RGBA_SUB: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_rgba_sub_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_rgba_sub_sse2(surface, rect, 
color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_rgba_sub(surface, rect, color); break; } case PYGAME_BLEND_RGBA_MULT: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_rgba_mult_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_rgba_mult_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_rgba_mult(surface, rect, color); break; } case PYGAME_BLEND_RGBA_MIN: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { result = surface_fill_blend_rgba_min_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_rgba_min_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_rgba_min(surface, rect, color); break; } case PYGAME_BLEND_RGBA_MAX: { -#if !defined(__EMSCRIPTEN__) -#if SDL_BYTEORDER == SDL_LIL_ENDIAN +#ifndef PG_SIMD_DISABLED if (PG_SURF_BytesPerPixel(surface) == 4) { - if (_pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + 
if (PG_HAS_AVX2()) { result = surface_fill_blend_rgba_max_avx2(surface, rect, color); break; } -#if PG_ENABLE_SSE_NEON - if (_pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { result = surface_fill_blend_rgba_max_sse2(surface, rect, color); break; } -#endif /* PG_ENABLE_SSE_NEON */ +#endif /* PG_HAS_SSE2_OR_NEON */ } -#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ -#endif /* __EMSCRIPTEN__ */ +#endif /* PG_SIMD_DISABLED */ result = surface_fill_blend_rgba_max(surface, rect, color); break; } diff --git a/src_c/transform.c b/src_c/transform.c index ab69290b2d..41ec1e26d4 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -33,7 +33,6 @@ #include <math.h> #include <string.h> -#include "simd_shared.h" #include "simd_transform.h" #include "scale.h" @@ -1255,8 +1254,8 @@ smoothscale_init(struct _module_state *st) return; } -#if !defined(__EMSCRIPTEN__) -#if PG_ENABLE_SSE_NEON +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_SSE2_OR_NEON if (SDL_HasSSE2()) { st->filter_type = "SSE2"; st->filter_shrink_X = filter_shrink_X_SSE2; @@ -1273,8 +1272,8 @@ smoothscale_init(struct _module_state *st) st->filter_expand_Y = filter_expand_Y_SSE2; return; } -#endif /* PG_ENABLE_SSE_NEON */ -#endif /* !__EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ #ifdef SCALE_MMX_SUPPORT if (SDL_HasSSE()) { st->filter_type = "SSE"; @@ -1655,8 +1654,8 @@ surf_set_smoothscale_backend(PyObject *self, PyObject *args, PyObject *kwargs) "%s not supported on this machine", type); } #endif /* ~defined(SCALE_MMX_SUPPORT) */ -#if !defined(__EMSCRIPTEN__) -#if PG_ENABLE_SSE_NEON +#ifndef PG_SIMD_DISABLED +#ifdef PG_HAS_SSE2_OR_NEON else if (strcmp(type, "SSE2") == 0) { if (!SDL_HasSSE2()) { return RAISE(PyExc_ValueError, @@ -1680,8 +1679,8 @@ surf_set_smoothscale_backend(PyObject *self, PyObject *args, PyObject *kwargs) st->filter_expand_X = filter_expand_X_SSE2; st->filter_expand_Y = filter_expand_Y_SSE2; } -#endif /* 
PG_ENABLE_SSE_NEON */ -#endif /* !__EMSCRIPTEN__ */ +#endif /* PG_HAS_SSE2_OR_NEON */ +#endif /* PG_SIMD_DISABLED */ else { return PyErr_Format(PyExc_ValueError, "Unknown backend type %s", type); } @@ -2196,31 +2195,30 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) return (SDL_Surface *)(RAISE(pgExc_SDLError, SDL_GetError())); } -#if defined(__EMSCRIPTEN__) - grayscale_non_simd(src, src_format, newsurf, newsurf_format); -#else // !defined(__EMSCRIPTEN__) +#ifndef PG_SIMD_DISABLED if (PG_FORMAT_BytesPerPixel(src_format) == 4 && src_format->Rmask == newsurf_format->Rmask && src_format->Gmask == newsurf_format->Gmask && src_format->Bmask == newsurf_format->Bmask && (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) { - if (pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { grayscale_avx2(src, newsurf); + goto end; } -#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) - else if (pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { grayscale_sse2(src, newsurf); + goto end; } -#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) - else { - grayscale_non_simd(src, src_format, newsurf, newsurf_format); - } +#endif /* PG_HAS_SSE2_OR_NEON */ } - else { - grayscale_non_simd(src, src_format, newsurf, newsurf_format); - } -#endif // !defined(__EMSCRIPTEN__) +#endif /* PG_SIMD_DISABLED */ + grayscale_non_simd(src, src_format, newsurf, newsurf_format); + goto end; /* Silence warning that the end label is not used. 
*/ +end: SDL_UnlockSurface(newsurf); return newsurf; @@ -4009,31 +4007,30 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) return (SDL_Surface *)(RAISE(pgExc_SDLError, SDL_GetError())); } -#if defined(__EMSCRIPTEN__) - invert_non_simd(src, src_format, newsurf, newsurf_format); -#else // !defined(__EMSCRIPTEN__) +#ifndef PG_SIMD_DISABLED if (PG_FORMAT_BytesPerPixel(src_format) == 4 && src_format->Rmask == newsurf_format->Rmask && src_format->Gmask == newsurf_format->Gmask && src_format->Bmask == newsurf_format->Bmask && (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) { - if (pg_has_avx2()) { +#ifdef PG_HAS_AVX2 + if (PG_HAS_AVX2()) { invert_avx2(src, newsurf); + goto end; } -#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) - else if (pg_HasSSE_NEON()) { +#endif /* PG_HAS_AVX2 */ +#ifdef PG_HAS_SSE2_OR_NEON + if (PG_HAS_SSE2_OR_NEON()) { invert_sse2(src, newsurf); + goto end; } -#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) - else { - invert_non_simd(src, src_format, newsurf, newsurf_format); - } +#endif /* PG_HAS_SSE2_OR_NEON */ } - else { - invert_non_simd(src, src_format, newsurf, newsurf_format); - } -#endif // !defined(__EMSCRIPTEN__) +#endif // PG_SIMD_DISABLED + invert_non_simd(src, src_format, newsurf, newsurf_format); + goto end; /* Silence warning that the end label is not used. */ +end: SDL_UnlockSurface(newsurf); return newsurf;