From 14c870d98bf7ae9fe8dfb9cd3cf11ec076e7021d Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 2 Sep 2025 13:45:40 +0800 Subject: [PATCH 01/15] ggml-cpu: stabilise nnpa fp32<->fp16 Signed-off-by: Aaron Teo --- common/common.h | 6 ++++++ ggml/src/ggml-cpu/ggml-cpu.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index 85b3b879d4536..3300b5b29f7f5 100644 --- a/common/common.h +++ b/common/common.h @@ -312,7 +312,13 @@ struct common_params { enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings + #if defined(GGML_NNPA) || defined(__NNPA__) + // disable Flash Attention on NNPA + // see: https://github.com/ggml-org/llama.cpp/issues/15721 + enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; + #else enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention + #endif struct common_params_sampling sampling; struct common_params_speculative speculative; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 0d5d3a3440aaf..b99f0b5fb8512 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3219,7 +3219,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { float32x4_t v_zero = vec_splats(0.0f); uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); - vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + y[i + 0] = vec_extract(v_y, 0); + y[i + 1] = vec_extract(v_y, 1); + y[i + 2] = vec_extract(v_y, 2); + y[i + 3] = vec_extract(v_y, 3); } #endif for (; i < n; ++i) { From 0cc2017caa818097defde6899d306e66ecfc6ce2 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 2 Sep 2025 13:59:48 +0800 
Subject: [PATCH 02/15] ggml-cpu: enable GGML_NNPA by default Signed-off-by: Aaron Teo --- docs/build-s390x.md | 12 ++++-------- ggml/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index f3cdd63be3ece..338886ca0419f 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc) cmake --build build --config Release -j $(nproc) ``` -- By default, NNPA is disabled by default. To enable it: +- By default, NNPA is enabled when available. To disable it (not recommended): ```bash cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_BLAS=ON \ -DGGML_BLAS_VENDOR=OpenBLAS \ - -DGGML_NNPA=ON + -DGGML_NNPA=OFF cmake --build build --config Release -j $(nproc) ``` @@ -166,7 +166,7 @@ Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (t ### 2. NNPA Vector Intrinsics Acceleration -Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation. +Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation. ### 3. zDNN Accelerator (WIP) @@ -230,10 +230,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl CXXFLAGS="-include cstdint" pip3 install -r requirements.txt ``` -5. `-DGGML_NNPA=ON` generates gibberish output - - Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). 
Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`. - ## Getting Help on IBM Z & LinuxONE 1. **Bugs, Feature Requests** @@ -292,4 +288,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl - 🚫 - acceleration unavailable, will still run using scalar implementation - ❓ - acceleration unknown, please contribute if you can test it yourself -Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025. +Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 2, 2025. diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 96be001f8cb7f..52c9ff51f5fe3 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -132,7 +132,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) -option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877 +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") From 1edd6ed4dc80c7909bdba91c012085607ec96d54 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 2 Sep 2025 16:22:34 +0800 Subject: [PATCH 03/15] ggml-cpu: switch flash attention disable to ggml-cpu Signed-off-by: Aaron Teo --- common/common.h | 6 ------ ggml/src/ggml-cpu/ggml-cpu.cpp | 9 +++++++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/common/common.h b/common/common.h index 3300b5b29f7f5..85b3b879d4536 100644 --- a/common/common.h +++ b/common/common.h @@ -312,13 +312,7 @@ struct common_params { enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum 
llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - #if defined(GGML_NNPA) || defined(__NNPA__) - // disable Flash Attention on NNPA - // see: https://github.com/ggml-org/llama.cpp/issues/15721 - enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; - #else enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention - #endif struct common_params_sampling sampling; struct common_params_speculative speculative; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 8dacd36714b4c..0297ae5d1674e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -441,6 +441,15 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st case GGML_OP_OUT_PROD: return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) && src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + case GGML_OP_FLASH_ATTN_EXT: + case GGML_OP_FLASH_ATTN_BACK: +#if defined(GGML_NNPA) || defined(__NNPA__) + // disable Flash Attention on NNPA + // see: https://github.com/ggml-org/llama.cpp/issues/15721 + return false; +#else + return true; +#endif // GGML_NNPA || __NNPA__ default: return true; } From b8e17f566452c0353601386d2c0cdd18c95ef516 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 2 Sep 2025 17:52:01 +0800 Subject: [PATCH 04/15] ggml-cpu: disable fattn via ggml Signed-off-by: Aaron Teo --- common/common.h | 2 +- ggml/include/ggml-cpu.h | 2 ++ ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++++++ ggml/src/ggml-cpu/ggml-cpu.cpp | 9 --------- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/common/common.h b/common/common.h index 85b3b879d4536..8a6d30c8e4d50 100644 --- a/common/common.h +++ b/common/common.h @@ -312,7 +312,7 @@ struct common_params { enum llama_rope_scaling_type rope_scaling_type = 
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention + enum llama_flash_attn_type flash_attn_type = ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; // whether to use Flash Attention struct common_params_sampling sampling; struct common_params_speculative speculative; diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index be40b100979de..c2df1f67bf765 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -105,6 +105,8 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void); + GGML_BACKEND_API int ggml_cpu_support_fattn (void); // whether Flash Attention is supported + // Internal types and functions exposed for tests and benchmarks typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b99f0b5fb8512..45a0b305dd0b0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3524,6 +3524,14 @@ int ggml_cpu_has_sme(void) { #endif } +int ggml_cpu_support_fattn(void) { +#if defined(GGML_NNPA) || defined(__NNPA__) + return 0; +#else + return 1; +#endif +} + void ggml_cpu_init(void) { // needed to initialize ggml_time { diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 0297ae5d1674e..8dacd36714b4c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -441,15 +441,6 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st case GGML_OP_OUT_PROD: return (src0->type == GGML_TYPE_F32 || 
(ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) && src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; - case GGML_OP_FLASH_ATTN_EXT: - case GGML_OP_FLASH_ATTN_BACK: -#if defined(GGML_NNPA) || defined(__NNPA__) - // disable Flash Attention on NNPA - // see: https://github.com/ggml-org/llama.cpp/issues/15721 - return false; -#else - return true; -#endif // GGML_NNPA || __NNPA__ default: return true; } From a59f36238433cafd52092d34f186ad012ff4cebb Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 2 Sep 2025 19:24:51 +0800 Subject: [PATCH 05/15] ggml-cpu: add comment for fattn disable Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 45a0b305dd0b0..d250f60c4607e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3526,6 +3526,8 @@ int ggml_cpu_has_sme(void) { int ggml_cpu_support_fattn(void) { #if defined(GGML_NNPA) || defined(__NNPA__) + // disable Flash Attention when using NNPA + // see: https://github.com/ggml-org/llama.cpp/issues/15721 return 0; #else return 1; From fde523146e5f025cf1efbc580ef203679215747d Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Wed, 3 Sep 2025 02:02:22 +0800 Subject: [PATCH 06/15] ggml-cpu: undo fattn override for nnpa Signed-off-by: Aaron Teo --- common/common.h | 2 +- ggml/include/ggml-cpu.h | 2 -- ggml/src/ggml-cpu/ggml-cpu.c | 10 ---------- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/common/common.h b/common/common.h index 8a6d30c8e4d50..85b3b879d4536 100644 --- a/common/common.h +++ b/common/common.h @@ -312,7 +312,7 @@ struct common_params { enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // 
attention type for embeddings - enum llama_flash_attn_type flash_attn_type = ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; // whether to use Flash Attention + enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention struct common_params_sampling sampling; struct common_params_speculative speculative; diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index c2df1f67bf765..be40b100979de 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -105,8 +105,6 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void); - GGML_BACKEND_API int ggml_cpu_support_fattn (void); // whether Flash Attention is supported - // Internal types and functions exposed for tests and benchmarks typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index d250f60c4607e..b99f0b5fb8512 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3524,16 +3524,6 @@ int ggml_cpu_has_sme(void) { #endif } -int ggml_cpu_support_fattn(void) { -#if defined(GGML_NNPA) || defined(__NNPA__) - // disable Flash Attention when using NNPA - // see: https://github.com/ggml-org/llama.cpp/issues/15721 - return 0; -#else - return 1; -#endif -} - void ggml_cpu_init(void) { // needed to initialize ggml_time { From ed91ef66b368178f18bb97e64e48def5eb845043 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Wed, 3 Sep 2025 02:03:28 +0800 Subject: [PATCH 07/15] ggml-cpu: temp disable faulty fp32<->fp16 conversion Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/simd-mappings.h | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 8bd56bdac1b43..d21729ccaf0cd 100644 --- 
a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -114,26 +114,26 @@ extern "C" { #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) -#elif defined(__NNPA__) - #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x) - #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x) - - #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) - #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) - - static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) { - uint16x8_t v_h = vec_splats(h); - uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0); - return vec_extend_to_fp32_hi(v_hd, 0)[0]; - } - - static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) { - float32x4_t v_f = vec_splats(f); - float32x4_t v_zero = vec_splats(0.0f); - uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0); - uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0); - return vec_extract(v_h, 0); - } +// #elif defined(__NNPA__) +// #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x) +// #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x) + +// #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) +// #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + +// static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) { +// uint16x8_t v_h = vec_splats(h); +// uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0); +// return vec_extend_to_fp32_hi(v_hd, 0)[0]; +// } + +// static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) { +// float32x4_t v_f = vec_splats(f); +// float32x4_t v_zero = vec_splats(0.0f); +// uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0); +// uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0); +// return vec_extract(v_h, 0); +// } #endif // precomputed f32 table for f16 (256 KB) From 
4200bead8b777c7e678098755a75c6bce674ca6b Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Wed, 3 Sep 2025 02:10:37 +0800 Subject: [PATCH 08/15] ggml-cpu: disable more faulty code for rework Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index b99f0b5fb8512..214e58806ffc1 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3206,24 +3206,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); _mm_storel_epi64((__m128i *)(y + i), y_vec); } -#elif defined(__NNPA__) - for (; i + 7 < n; i += 8) { - float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); - float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); - uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); - uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); - vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); - } - for (; i + 3 < n; i += 4) { - float32x4_t v_x = vec_xl(0, (const float *)(x + i)); - float32x4_t v_zero = vec_splats(0.0f); - uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); - uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); - y[i + 0] = vec_extract(v_y, 0); - y[i + 1] = vec_extract(v_y, 1); - y[i + 2] = vec_extract(v_y, 2); - y[i + 3] = vec_extract(v_y, 3); - } +// #elif defined(__NNPA__) +// for (; i + 7 < n; i += 8) { +// float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); +// float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); +// uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); +// uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); +// vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); +// } +// for (; i + 3 < n; i += 4) { +// float32x4_t v_x = vec_xl(0, (const float *)(x + i)); +// float32x4_t v_zero = vec_splats(0.0f); +// uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); +// uint16x8_t v_y = 
vec_convert_to_fp16(v_yd, 0); +// y[i + 0] = vec_extract(v_y, 0); +// y[i + 1] = vec_extract(v_y, 1); +// y[i + 2] = vec_extract(v_y, 2); +// y[i + 3] = vec_extract(v_y, 3); +// } #endif for (; i < n; ++i) { y[i] = GGML_CPU_FP32_TO_FP16(x[i]); From 0b3bec8cad66b8d4670765be7e7236e0bd1794ac Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:22:59 +0800 Subject: [PATCH 09/15] ggml-cpu: add more logging to detect -inf Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 34 +++++++++++++++++++++++----------- ggml/src/ggml-cpu/ops.cpp | 11 ++++++++++- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 214e58806ffc1..917786b822ff2 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3206,7 +3206,7 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); _mm_storel_epi64((__m128i *)(y + i), y_vec); } -// #elif defined(__NNPA__) +#elif defined(__NNPA__) // for (; i + 7 < n; i += 8) { // float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); // float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); @@ -3214,16 +3214,28 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { // uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); // vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); // } -// for (; i + 3 < n; i += 4) { -// float32x4_t v_x = vec_xl(0, (const float *)(x + i)); -// float32x4_t v_zero = vec_splats(0.0f); -// uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); -// uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); -// y[i + 0] = vec_extract(v_y, 0); -// y[i + 1] = vec_extract(v_y, 1); -// y[i + 2] = vec_extract(v_y, 2); -// y[i + 3] = vec_extract(v_y, 3); -// } + for (; i + 3 < n; i += 4) { + if (isnan(x[i + 0]) || x[i + 0] == -INFINITY) GGML_LOG_INFO("%s: x[0]: %10.6f (0x%08x)\n", __func__, x[i + 0], x[i + 0]); + if (isnan(x[i + 1]) || x[i + 1] 
== -INFINITY) GGML_LOG_INFO("%s: x[1]: %10.6f (0x%08x)\n", __func__, x[i + 1], x[i + 1]); + if (isnan(x[i + 2]) || x[i + 2] == -INFINITY) GGML_LOG_INFO("%s: x[2]: %10.6f (0x%08x)\n", __func__, x[i + 2], x[i + 2]); + if (isnan(x[i + 3]) || x[i + 3] == -INFINITY) GGML_LOG_INFO("%s: x[3]: %10.6f (0x%08x)\n", __func__, x[i + 3], x[i + 3]); + + if (isnan(x[i + 0]) || x[i + 0] == -1.0 / 0.0 || + isnan(x[i + 1]) || x[i + 1] == -1.0 / 0.0 || + isnan(x[i + 2]) || x[i + 2] == -1.0 / 0.0 || + isnan(x[i + 3]) || x[i + 3] == -1.0 / 0.0) { + raise(SIGINT); + } + + float32x4_t v_x = vec_xl(0, (const float *)(x + i)); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + y[i + 0] = vec_extract(v_y, 0); + y[i + 1] = vec_extract(v_y, 1); + y[i + 2] = vec_extract(v_y, 2); + y[i + 3] = vec_extract(v_y, 3); + } #endif for (; i < n; ++i) { y[i] = GGML_CPU_FP32_TO_FP16(x[i]); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 8c1f7948855ac..76e93ed6ae477 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -750,8 +750,16 @@ static void ggml_compute_forward_dup_f32( for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const float src_val = *src0_ptr; - dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); + if (isinf(src_val) && src_val < 0) { + fprintf(stderr, "WARNING: -inf detected in ggml_compute_forward_dup_f32 -> F16\n"); + fprintf(stderr, " Source position: i00=%d, i01=%d, i02=%d, i03=%d\n", (int)i00, (int)i01, (int)i02, (int)i03); + fprintf(stderr, " Linear index: %zu, value: %f\n", id, src_val); + fprintf(stderr, " Thread: %d/%d\n", ith, nth); + } + + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(src_val); id++; } } @@ -4155,6 +4163,7 @@ static void ggml_compute_forward_rms_norm_f32( GGML_ASSERT(eps >= 0.0f); + // TODO: optimize for 
(int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { From b9ce37e570dbdb0a69fe7a8c208045d7e1821781 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:25:23 +0800 Subject: [PATCH 10/15] ggml-cpu: wip Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++---- ggml/src/ggml-cpu/ops.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 917786b822ff2..e5e61d9bf547f 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3220,10 +3220,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { if (isnan(x[i + 2]) || x[i + 2] == -INFINITY) GGML_LOG_INFO("%s: x[2]: %10.6f (0x%08x)\n", __func__, x[i + 2], x[i + 2]); if (isnan(x[i + 3]) || x[i + 3] == -INFINITY) GGML_LOG_INFO("%s: x[3]: %10.6f (0x%08x)\n", __func__, x[i + 3], x[i + 3]); - if (isnan(x[i + 0]) || x[i + 0] == -1.0 / 0.0 || - isnan(x[i + 1]) || x[i + 1] == -1.0 / 0.0 || - isnan(x[i + 2]) || x[i + 2] == -1.0 / 0.0 || - isnan(x[i + 3]) || x[i + 3] == -1.0 / 0.0) { + if (isnan(x[i + 0]) || x[i + 0] == -INFINITY || + isnan(x[i + 1]) || x[i + 1] == -INFINITY || + isnan(x[i + 2]) || x[i + 2] == -INFINITY || + isnan(x[i + 3]) || x[i + 3] == -INFINITY) { raise(SIGINT); } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 76e93ed6ae477..c853cf1918521 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -752,7 +752,7 @@ static void ggml_compute_forward_dup_f32( const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); const float src_val = *src0_ptr; - if (isinf(src_val) && src_val < 0) { + if ((isnan(src_val) || isinf(src_val) || src_val == -INFINITY) && src_val < 0) { fprintf(stderr, "WARNING: -inf detected in ggml_compute_forward_dup_f32 -> F16\n"); fprintf(stderr, " Source position: i00=%d, i01=%d, i02=%d, i03=%d\n", (int)i00, (int)i01, (int)i02, 
(int)i03); fprintf(stderr, " Linear index: %zu, value: %f\n", id, src_val); From d73c4cd67409d4a611c654306448a1a820814c3b Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:30:55 +0800 Subject: [PATCH 11/15] ggml-cpu: log dup_f32 Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ops.cpp | 233 +++++++++++++++++++++++++++++++++++--- 1 file changed, 220 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index c853cf1918521..0187cbbdaa529 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -750,16 +750,10 @@ static void ggml_compute_forward_dup_f32( for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - const float src_val = *src0_ptr; - if ((isnan(src_val) || isinf(src_val) || src_val == -INFINITY) && src_val < 0) { - fprintf(stderr, "WARNING: -inf detected in ggml_compute_forward_dup_f32 -> F16\n"); - fprintf(stderr, " Source position: i00=%d, i01=%d, i02=%d, i03=%d\n", (int)i00, (int)i01, (int)i02, (int)i03); - fprintf(stderr, " Linear index: %zu, value: %f\n", id, src_val); - fprintf(stderr, " Thread: %d/%d\n", ith, nth); - } + GGML_LOG_INFO("%s: src0_ptr[%d] = %f\n", __func__, i00, *src0_ptr); - dst_ptr[id] = GGML_CPU_FP32_TO_FP16(src_val); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; } } @@ -4163,7 +4157,6 @@ static void ggml_compute_forward_rms_norm_f32( GGML_ASSERT(eps >= 0.0f); - // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -7036,6 +7029,209 @@ void ggml_compute_forward_im2col_back_f32( } } + +// ggml_compute_forward_im2col_3d_f16 +// src0: kernel [OC*IC, KD, KH, KW] +// src1: image [N*IC, ID, IH, IW] +// dst: result [N*OD, OH, OW, IC * KD * KH * KW] +static void ggml_compute_forward_im2col_3d_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * 
src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t s2 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[3]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[4]; + const int32_t p2 = ((const int32_t *)(dst->op_params))[5]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[6]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[7]; + const int32_t d2 = ((const int32_t *)(dst->op_params))[8]; + const int32_t IC = ((const int32_t *)(dst->op_params))[9]; + + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = ne13 / IC; + const int64_t ID = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + const int64_t OC = ne03 / IC; + GGML_UNUSED(OC); + const int64_t KD = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OD = ne3 / N; + const int64_t OH = ne2; + const int64_t OW = ne1; + const int64_t OH_OW = OH*OW; + const int64_t KD_KH_KW = KD*KH*KW; + const int64_t KH_KW = KH*KW; + const int64_t IC_KD_KH_KW = IC*KD*KH*KW; + + GGML_ASSERT(nb10 == sizeof(float)); + + // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iod = 0; iod < OD; iod++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW] + const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + 
iic)*nb13); // [ID, IH, IW] + + for (int64_t ikd = 0; ikd < KD; ikd++) { + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + const int64_t iid = iod*s2 + ikd*d2 - p2; + + if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) { + dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0; + } else { + const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW] + dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s); + } + } + } + } + } + } + } + } + } + } +} + +// ggml_compute_forward_im2col_3d_f32 +// src0: kernel [OC*IC, KD, KH, KW] +// src1: image [N*IC, ID, IH, IW] +// dst: result [N*OD, OH, OW, IC * KD * KH * KW] +static void ggml_compute_forward_im2col_3d_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t s2 = ((const int32_t *)(dst->op_params))[2]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[3]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[4]; + const int32_t p2 = ((const int32_t *)(dst->op_params))[5]; + const int32_t d0 = ((const int32_t *)(dst->op_params))[6]; + const int32_t d1 = ((const int32_t *)(dst->op_params))[7]; + const int32_t d2 = ((const int32_t *)(dst->op_params))[8]; + const int32_t IC = ((const int32_t *)(dst->op_params))[9]; + + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t N = ne13 / IC; + const int64_t ID = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + const 
int64_t OC = ne03 / IC; + GGML_UNUSED(OC); + const int64_t KD = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OD = ne3 / N; + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int64_t OH_OW = OH*OW; + const int64_t KD_KH_KW = KD*KH*KW; + const int64_t KH_KW = KH*KW; + const int64_t IC_KD_KH_KW = IC*KD*KH*KW; + + GGML_ASSERT(nb10 == sizeof(float)); + + // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW] + { + float * const wdata = (float *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t iod = 0; iod < OD; iod++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic += nth) { + + // micro kernel + float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW] + const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW] + + for (int64_t ikd = 0; ikd < KD; ikd++) { + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + const int64_t iid = iod*s2 + ikd*d2 - p2; + + if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) { + dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0; + } else { + const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW] + dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s; + } + } + } + } + } + } + } + } + } + } +} + + +void ggml_compute_forward_im2col_3d( + const ggml_compute_params * params, + ggml_tensor * dst) { + switch (dst->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_im2col_3d_f16(params, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_im2col_3d_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + static void 
ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k, void * a, void * b, float * c) { const ggml_type_traits * traits = ggml_get_type_traits(type); @@ -8023,6 +8219,15 @@ static void ggml_compute_forward_pad_f32( GGML_TENSOR_UNARY_OP_LOCALS float * dst_ptr = (float *) dst->data; + const int32_t lp0 = ggml_get_op_params_i32(dst, 0); + const int32_t rp0 = ggml_get_op_params_i32(dst, 1); + const int32_t lp1 = ggml_get_op_params_i32(dst, 2); + const int32_t rp1 = ggml_get_op_params_i32(dst, 3); + const int32_t lp2 = ggml_get_op_params_i32(dst, 4); + const int32_t rp2 = ggml_get_op_params_i32(dst, 5); + const int32_t lp3 = ggml_get_op_params_i32(dst, 6); + const int32_t rp3 = ggml_get_op_params_i32(dst, 7); + // TODO: optimize @@ -8031,10 +8236,12 @@ static void ggml_compute_forward_pad_f32( for (int64_t i0 = 0; i0 < ne0; ++i0) { for (int64_t i3 = 0; i3 < ne3; ++i3) { const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0; - - const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + if ((i0 >= lp0 && i0 < ne0 - rp0) \ + && (i1 >= lp1 && i1 < ne1 - rp1) \ + && (i2 >= lp2 && i2 < ne2 - rp2) \ + && (i3 >= lp3 && i3 < ne3 - rp3)) { + const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00; + const float * src_ptr = (const float *)((char *) src0->data + src_idx); dst_ptr[dst_idx] = *src_ptr; } else { dst_ptr[dst_idx] = 0; From 0510f25d971d8250932757250b576b4b5901a710 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:38:25 +0800 Subject: [PATCH 12/15] ggml-cpu: wip Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ops.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 0187cbbdaa529..6907cc54a464a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ 
-710,6 +710,12 @@ static void ggml_compute_forward_dup_f32( id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + if (isinf(*src0_ptr) || isnan(*src0_ptr) || *src0_ptr == -INFINITY) { + GGML_LOG_INFO("%s: L715: INF detected - src0_ptr = %f, i01=%d, i02=%d, i03=%d, ne00=%lld\n", + __func__, *src0_ptr, i01, i02, i03, (long long)ne00); + } + from_float(src0_ptr, dst_ptr + id, ne00); id += rs; } @@ -751,8 +757,6 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - GGML_LOG_INFO("%s: src0_ptr[%d] = %f\n", __func__, i00, *src0_ptr); - dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; } From fac0dbcaf15a0108506325cea9ad8c00c84dbca1 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:39:58 +0800 Subject: [PATCH 13/15] ggml-cpu: wip Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ops.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 6907cc54a464a..51d54590860e9 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -711,10 +711,7 @@ static void ggml_compute_forward_dup_f32( for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - if (isinf(*src0_ptr) || isnan(*src0_ptr) || *src0_ptr == -INFINITY) { - GGML_LOG_INFO("%s: L715: INF detected - src0_ptr = %f, i01=%d, i02=%d, i03=%d, ne00=%lld\n", - __func__, *src0_ptr, i01, i02, i03, (long long)ne00); - } + GGML_LOG_INFO("%s: L714: src0_ptr = %f\n", __func__, *src0_ptr); from_float(src0_ptr, dst_ptr + id, ne00); id += rs; From dc84c2a275a0d1dc4e7e7c176318635719f1f101 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:42:49 +0800 Subject: [PATCH 14/15] ggml-cpu: wip Signed-off-by: Aaron Teo --- 
ggml/src/ggml-cpu/ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 51d54590860e9..9871e843c6c65 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -711,7 +711,9 @@ static void ggml_compute_forward_dup_f32( for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - GGML_LOG_INFO("%s: L714: src0_ptr = %f\n", __func__, *src0_ptr); + for (size_t za = 0; za < ne00; za++) { + GGML_LOG_INFO("%s: L715: src0_ptr[%zu] = %f\n", __func__, za, src0_ptr[za]); + } from_float(src0_ptr, dst_ptr + id, ne00); id += rs; From f00ecafebf9b5bab2cf5fffe38b49efaec1f1cef Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 5 Sep 2025 17:49:00 +0800 Subject: [PATCH 15/15] ggml-cpu: wip Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/ops.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9871e843c6c65..10627147cdd3f 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -1,5 +1,7 @@ #include "ops.h" +#include <csignal> + +#include "ggml-cpu.h" #include "ggml-impl.h" #include "binary-ops.h" @@ -713,6 +715,10 @@ static void ggml_compute_forward_dup_f32( for (size_t za = 0; za < ne00; za++) { GGML_LOG_INFO("%s: L715: src0_ptr[%zu] = %f\n", __func__, za, src0_ptr[za]); + if (src0_ptr[za] == -INFINITY) { + GGML_LOG_WARN("%s: L717: WARNING - NEGATIVE INFINITY DETECTED at index %zu! src0_ptr[%zu] = %f\n", __func__, za, za, src0_ptr[za]); + std::raise(SIGINT); + } } from_float(src0_ptr, dst_ptr + id, ne00);