diff --git a/README.md b/README.md
index 352e6b68..896b09cf 100644
--- a/README.md
+++ b/README.md
@@ -153,7 +153,9 @@ Please install libnuma package:
   git checkout <latest-tag>
   # Please make sure torch is installed when run python example
   mkdir build && cd build
+  # Notice: use gcc-13 or higher
   cmake ..
+  # If you see the error "numa.h: No such file or directory", install libnuma first, then build with "CPATH=$CONDA_PATH/include/:$CPATH make -j".
   make -j
   ```
 - Using `python setup.py`
diff --git a/README_CN.md b/README_CN.md
index 12e4b63e..362ea9c4 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -154,7 +154,9 @@ docker run -it \
   git checkout <latest-tag>
   # 如果使用python示例,请确保已经安装torch。
   mkdir build && cd build
+  # 注意使用gcc-13及以上版本
   cmake ..
+  # 若遇到错误 "numa.h: No such file or directory",需要先安装numa包,然后使用 "CPATH=$CONDA_PATH/include/:$CPATH make -j"完成编译
   make -j
   ```
 - 使用 `python setup.py`
diff --git a/cmake/xdnn.cmake b/cmake/xdnn.cmake
index 6311506e..da576d21 100644
--- a/cmake/xdnn.cmake
+++ b/cmake/xdnn.cmake
@@ -26,8 +26,8 @@ include(ExternalProject)
 
 # cmake-format: off
 ExternalProject_Add(xdnn_lib
-    URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/xdnn_v1.5.7.tar.gz
-    URL_HASH MD5=6cad71df05ef120e058bce28a0a478a8
+    URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/xdnn_v1.5.9.tar.gz
+    URL_HASH MD5=3aa9cd15df3eb2a7a1c178f3edcf9d37
     TIMEOUT 120
     SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/xdnn
     CONFIGURE_COMMAND ""
diff --git a/src/utils/matmul_helper.h b/src/utils/matmul_helper.h
index 9235e3b4..17e96254 100644
--- a/src/utils/matmul_helper.h
+++ b/src/utils/matmul_helper.h
@@ -524,12 +524,12 @@ class MMHelper {
 
         // E4M3
         else if constexpr (std::is_same_v<WeiT, e4m3_t>) {
-            int amx_rows = (int)((K + 15) / 16) * 16;
-            int amx_cols = (int)((N + 63) / 64) * 64;
-            if (!weight.isShadow()) weight.Resize(amx_rows, amx_cols);
-            memset(weight.Data(), 0, sizeof(e4m3_t) * amx_rows * amx_cols);
+            int packBlkSize = 32;
+            size_t pack_size = xdnn_small_amx_sgemm_bf16f8bf16_packb_size(K, N, packBlkSize);
+            if (!weight.isShadow()) weight.Resize((pack_size + N - 1) / N, N);
+            memset(weight.Data(), 0, sizeof(e4m3_t) * pack_size);
             xdnn_small_amx_sgemm_bf16f8bf16_packb(trans, N, K, (const XDNN_E4M3 *)src.Data(), src.Stride(),
-                    (XDNN_E4M3 *)weight.Data(), 64);
+                    (XDNN_E4M3 *)weight.Data(), packBlkSize);
         }
     }
 
@@ -691,7 +691,7 @@ class MMHelper {
 
         // E4M3
         else if constexpr (std::is_same_v<WeiT, e4m3_t>) {
-            if (M <= 16) {
+            if (true) {
                 assert(blockSize == 128);
                 if (lds == -1) lds = (K + 127) / 128;
                 GEMMVERBOSE("xdnn_gemm_bf16f8bf16_compute",
@@ -1509,7 +1509,7 @@ class MMHelper {
 
         // E4M3
         else if constexpr (std::is_same_v<WeiT, e4m3_t>) {
-            if (M <= 16) {
+            if (true) {
                 assert(blockSize == 128);
                 if (lds == -1) lds = (K + 127) / 128;
                 GEMMVERBOSE("xdnn_gemm_bf16f8bf16_compute_residential",
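
The matmul_helper.h hunk at line 524 replaces fixed AMX tile padding (rows rounded up to a multiple of 16, columns to a multiple of 64) with a buffer sized by the element count that xdnn_small_amx_sgemm_bf16f8bf16_packb_size reports, then maps that flat count onto a 2D Resize via ceiling division. A minimal sketch of that sizing logic; packed_rows is a hypothetical helper name, not an xFasterTransformer or xdnn function, and the pack_size value below is an arbitrary example:

```cpp
#include <cassert>
#include <cstddef>

// Hypothetical helper mirroring the Resize dimensions in the patch:
// weight.Resize((pack_size + N - 1) / N, N) picks the smallest row count
// whose rows * N capacity still holds all pack_size packed elements.
static size_t packed_rows(size_t pack_size, size_t N) {
    return (pack_size + N - 1) / N; // ceiling division
}

int main() {
    const size_t N = 4096;
    // Example count; the real value comes from
    // xdnn_small_amx_sgemm_bf16f8bf16_packb_size(K, N, packBlkSize).
    const size_t pack_size = N * 100 + 1; // one element past 100 full rows
    const size_t rows = packed_rows(pack_size, N);
    assert(rows == 101);                // the tail element forces an extra row
    assert(rows * N >= pack_size);      // capacity covers every packed element
    assert((rows - 1) * N < pack_size); // and is not a full row too large
    return 0;
}
```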
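The two `if (M <= 16)` to `if (true)` hunks remove the batch-size gate, so the e4m3 branch now calls the xdnn bf16f8bf16 kernel for every M rather than only for decode-sized inputs. A before/after sketch of that dispatch, assuming the surrounding code previously routed larger M to some other path; xdnnFp8Gemm and fallbackGemm are placeholder names, not real xFasterTransformer functions:

```cpp
#include <cstdio>

// Placeholder stand-ins for the two paths around the patched hunks.
static void xdnnFp8Gemm(int M) { std::printf("xdnn fp8 kernel, M=%d\n", M); }
static void fallbackGemm(int M) { std::printf("fallback path, M=%d\n", M); }

// Before the patch: only small batches (M <= 16) used the xdnn fp8 kernel.
static void computeBefore(int M) {
    if (M <= 16) xdnnFp8Gemm(M);
    else fallbackGemm(M);
}

// After the patch: the gate is forced true, so every batch size takes the
// xdnn fp8 kernel (the block structure is kept; only the condition changed).
static void computeAfter(int M) {
    if (true) xdnnFp8Gemm(M);
}

int main() {
    computeBefore(64); // fallback path, M=64
    computeAfter(64);  // xdnn fp8 kernel, M=64
    return 0;
}
```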