diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b5c16f86..97edf2f5f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,14 +78,15 @@ endif () # set base sources set(PLSSVM_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/Kokkos/execution_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/data_parallel_kernels.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/implementation_types.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/kernel_invocation_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/stdpar/implementation_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/execution_range.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/data_set/min_max_scaler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/parser_predict.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/parser_scale.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/parser_train.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/io/file_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/data_distribution.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/memory_size.cpp @@ -638,6 +639,37 @@ if (PLSSVM_ENABLE_LTO) endif () endif () +######################################################################################################################## +# enable the requested vectorization widths for the auto-vectorizers # +######################################################################################################################## +# GCC and clang both do not automatically auto-vectorize for AVX-512 (only AVX2) +# -> enable it if "cpu:avx512" was passed as PLSSVM_TARGET_PLATFORMS +if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + if (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx512") + message(STATUS "Enabling AVX512 support for the auto-vectorizers (-mprefer-vector-width=512).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=512>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx2" OR ${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx") + message(STATUS "Enabling AVX/AVX2 support for the auto-vectorizers (-mprefer-vector-width=256).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=256>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} MATCHES "^sse") + message(STATUS "Enabling SSE for the auto-vectorizers (-mprefer-vector-width=128).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=128>> + ) + else () + message(FATAL_ERROR "Unrecognized CPU target architecture \"${PLSSVM_CPU_TARGET_ARCHS}\". 
Allowed values are: avx512, avx2, avx, sse.") + endif () +else () + # automatically use the "optimal" auto-vectorizer width +endif () + ######################################################################################################################## # check for optional and necessary dependencies # ######################################################################################################################## @@ -914,16 +946,16 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) choose the SYCL implementation to be used in the SYCL backend: ${PLSSVM_SYCL_BACKEND_NAME_LIST} (default: automatic) " ) - string(REPLACE ";" "|" PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST}") - set(PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY + string(REPLACE ";" "|" PLSSVM_SYCL_DATA_PARALLEL_KERNEL_NAME_LIST "${PLSSVM_SYCL_DATA_PARALLEL_KERNEL_NAME_LIST}") + set(PLSSVM_SYCL_DATA_PARALLEL_KERNEL_MANPAGE_ENTRY " .TP -.B --sycl_kernel_invocation_type -choose the kernel invocation type when using SYCL as backend: ${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST} (default: automatic) +.B --sycl_data_parallel_kernel +choose the data parallel kernel when using SYCL as backend: ${PLSSVM_SYCL_DATA_PARALLEL_KERNEL_NAME_LIST} (default: automatic) " ) endif () -set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY}${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") +set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_DATA_PARALLEL_KERNEL_MANPAGE_ENTRY}${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") # assemble the Kokkos manpage entry if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) string(REPLACE ";" "|" PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}") diff --git a/README.md b/README.md index c764ed4d1..cb0efd1a6 100644 --- a/README.md +++ b/README.md @@ -31,38 +31,38 @@ A [Support Vector Machine (SVM)](https://en.wikipedia.org/wiki/Support-vector_machine) is a supervised machine learning model. In its basic form SVMs are used for binary classification tasks. Their fundamental idea is to learn a hyperplane which separates the two classes best, i.e., where the widest possible margin around its decision boundary is free of data. -This is also the reason, why SVMs are also called "large margin classifiers". +This is also the reason, why SVMs are also called "large margin classifiers." To predict to which class a new, unseen data point belongs, the SVM simply has to calculate on which side of the previously calculated hyperplane the data point lies. -This is very efficient since it only involves a single scalar product of the size corresponding to the numer of features of the data set. +This is very efficient since it only involves a single scalar product of the size corresponding to the numer of features per data point in the data set.

Basic idea of a Support Vector Machine as a classification model.
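
To make the single-scalar-product remark above concrete, here is a minimal C++ sketch of a linear decision function; the function and variable names are purely illustrative and are **not** part of the PLSSVM API:

```cpp
#include <cstddef>
#include <vector>

// illustrative only: classify one data point with an already learned linear hyperplane,
// i.e., evaluate sign(w * x + b) -- a single dot product over all features
int predict_linear(const std::vector<double> &weights, const double bias, const std::vector<double> &point) {
    double decision = bias;
    for (std::size_t i = 0; i < weights.size(); ++i) {
        decision += weights[i] * point[i];  // one multiply-add per feature
    }
    return decision >= 0.0 ? 1 : -1;  // the sign determines the side of the hyperplane
}
```

Evaluating this once per new data point is all the prediction step has to do after the hyperplane (here `weights` and `bias`) has been learned.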

-However, normal SVMs suffer in their potential parallelizability. +However, normal SVMs suffer from their potential parallelizability. Determining the hyperplane boils down to solving a convex quadratic problem. For this, most SVM implementations use Sequential Minimal Optimization (SMO), an inherently sequential algorithm. The basic idea of this algorithm is that it takes a pair of data points and calculates the hyperplane between them. Afterward, two new data points are selected and the existing hyperplane is adjusted accordingly. -This procedure is repeat until a new adjustment would be smaller than some epsilon greater than zero. +This procedure is repeated until a new adjustment would be smaller than some epsilon greater than zero. Some SVM implementations try to harness some parallelization potential by not drawing point pairs but group of points. In this case, the hyperplane calculation inside this group is parallelized. -However, even then modern highly parallel hardware can not be utilized efficiently. +However, even then, modern highly parallel hardware cannot be utilized efficiently. Therefore, we implemented a version of the original proposed SVM called [Least Squares Support Vector Machine (LS-SVM)](https://en.wikipedia.org/wiki/Least-squares_support-vector_machine). The LS-SVMs reformulated the original problem such that it boils down to solving a system of linear equations. -For this kind of problem many highly parallel algorithms and implementations are known. +For this kind of problem, many highly parallel algorithms and implementations are known. We decided to use the [Conjugate Gradient (CG)](https://en.wikipedia.org/wiki/Conjugate_gradient_method) to solve the system of linear equations. The main highlights of our SVM implementations are: 1. Drop-in replacement for LIBSVM's `svm-train`, `svm-predict`, and `svm-scale` (some features currently not implemented). -2. Support of multiple different programming frameworks for parallelization (also called backends in our PLSSVM implementation) which allows us to target GPUs and CPUs from different vendors like NVIDIA, AMD, or Intel: +2. Support for multiple different programming frameworks for parallelization (also called backends in our PLSSVM implementation) which allows us to support GPUs and CPUs from different vendors like NVIDIA, AMD, or Intel: - [OpenMP](https://www.openmp.org/) - [HPX](https://hpx.stellar-group.org/) (tested with current master) - C++ 17's standard parallelism [stdpar](https://en.cppreference.com/w/cpp/algorithm):
**Note**: due to the USM mechanics used in the `stdpar` implementations, the `stdpar` backend **can't** be enabled together with **any** other backend!
- **Note**: since every translation units need to be compiled with the same flag, we currently globally set `CMAKE_CXX_FLAGS` although it's discouraged. + **Note**: since every translation unit needs to be compiled with the same flag, we currently globally set `CMAKE_CXX_FLAGS` although it's discouraged. - [nvc++](https://developer.nvidia.com/hpc-sdk) from NVIDIA's HPC SDK (tested with version [25.3](https://docs.nvidia.com/hpc-sdk/hpc-sdk-release-notes/index.html)) - [roc-stdpar](https://github.com/ROCm/roc-stdpar) merged into upstream LLVM starting with version 18 (tested with version [18](https://releases.llvm.org/)) - [icpx](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html) as Intel's oneAPI compiler (tested with version [2025.0.0](https://www.intel.com/content/www/us/en/developer/articles/release-notes/oneapi-dpcpp/2025.html)) @@ -82,8 +82,9 @@ The main highlights of our SVM implementations are: - sigmoid: $\tanh(\gamma$ $\cdot$ $\vec{u}^T$ $\cdot$ $\vec{v}$ $+$ $coef0)$ - laplacian: $\exp(-\gamma$ $\cdot |$ $\vec{u}$ $-$ $\vec{v}$ $|_1)$ - chi-squared (only well-defined for values > 0): $\exp(-\gamma \cdot \sum_i \frac{(x[i] - y[i])^2}{x[i] + y[i]})$ -4. Two different solver types for a trade-off between memory footprint and runtime: - - `cg_explicit`: large memory overhead but very fast +4. Three different solver types for a trade-off between memory footprint and runtime: + - `cg_explicit`: large memory overhead but fast + - `cg_streaming`: the respective runtime automatically handles the memory migrations but may reduce the performance (implemented via unified shared memory) - `cg_implicit`: slower but requires drastically less memory 5. Multi-class classification available via one vs. all (also one vs. rest or OAA) and one vs. one (also OAO): - OAA: one huge classification task where our CG algorithm solves a system of linear equations with multiple right-hand sides. The resulting model file is **not** compatible with LIBSVM. 
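
As a small illustration of the kernel functions listed above, the chi-squared kernel can be written down directly from its formula; this sketch is for illustration only and does not use PLSSVM's internal kernel implementations:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// illustrative only: chi-squared kernel k(x, y) = exp(-gamma * sum_i (x[i] - y[i])^2 / (x[i] + y[i]))
// note: only well-defined if all feature values are > 0
double chi_squared_kernel(const std::vector<double> &x, const std::vector<double> &y, const double gamma) {
    double sum = 0.0;
    for (std::size_t i = 0; i < x.size(); ++i) {
        const double diff = x[i] - y[i];
        sum += diff * diff / (x[i] + y[i]);
    }
    return std::exp(-gamma * sum);
}
```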
@@ -110,7 +111,7 @@ General dependencies: - [Pybind11 ≥ v2.13.6](https://github.com/pybind/pybind11) if Python bindings are enabled - [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed-up library utilities (like file parsing) - [MPI](https://www.mpi-forum.org/) if distributed memory systems should be supported; [mpi4py](https://mpi4py.readthedocs.io/en/stable/) to enable interoperability in our Python bindings -- [Format.cmake](https://github.com/TheLartians/Format.cmake) if auto formatting via cmake-format and clang-format is enabled; also requires at least clang-format-18 and git, additionally, needs our custom [cmake-format fork](https://github.com/vancraar/cmake_format) incorporating some patches +- [Format.cmake](https://github.com/TheLartians/Format.cmake) if auto formatting via cmake-format and clang-format is enabled; it also requires at least clang-format-18 and git, additionally, needs our custom [cmake-format fork](https://github.com/vancraar/cmake_format) incorporating some patches - multiple Python modules used in the utility scripts, to install all modules use `pip install --user -r install/python_requirements.txt` Additional dependencies for the OpenMP backend: @@ -297,8 +298,15 @@ The `[optional_options]` can be one or multiple of: - `PLSSVM_ENABLE_FAST_MATH=ON|OFF` (default depending on `CMAKE_BUILD_TYPE`: `ON` for Release or RelWithDebInfo, `OFF` otherwise): enable `fast-math` compiler flags for all backends - `PLSSVM_ENABLE_ASSERTS=ON|OFF` (default: `OFF`): enables custom assertions - `PLSSVM_USE_FLOAT_AS_REAL_TYPE=ON|OFF` (default: `OFF`): use `float` as real_type instead of `double` -- `PLSSVM_THREAD_BLOCK_SIZE` (default: `8`): set a specific thread block size used in the GPU kernels (for fine-tuning optimizations) -- `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `4`): set a specific internal block size used in the GPU kernels (for fine-tuning optimizations) +- `PLSSVM_THREAD_BLOCK_SIZE` (default: `8`): set a specific thread block size used in the kernels (for fine-tuning optimizations)
+ **Note**: for the different execution spaces in the Kokkos backend, the maximum value of the `PLSSVM_THREAD_BLOCK_SIZE` is not as straight forward as one may wish: + - CUDA, HIP, and SYCL: the maximum value depends on the underlying backend (in practice $\sqrt{1024}$ = 32) + - HPX and Serial: must **exactly** be 1 + - OpenMP: must be 1 or 2 (most likely only 1 will work) + - Threads: must be 1; however, note that Kokkos itself **must** be built with hwloc support (via `-DKokkos_ENABLE_HWLOC=ON`), otherwise the Kokkos::Threads execution space will always only use a single core + - OpenMPTarget: $\sqrt{256}$ = 16 + - OpenACC: $\lfloor\sqrt{512}\rfloor$ = 22 +- `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `4`): set a specific internal block size used in the kernels (for fine-tuning optimizations) - `PLSSVM_ENABLE_LTO=ON|OFF` (default: `OFF`): enable interprocedural optimization (IPO/LTO) if supported by the compiler - `PLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=ON|OFF` (default: `ON`): enforce the maximum (device) memory allocation size for the plssvm::solver_type::automatic solver - `PLSSVM_ENABLE_PINNED_MEMORY=ON|OFF` (default: `OFF`): use host pinned memory for the input matrix when assembling the kernel matrix, if available @@ -347,7 +355,7 @@ If the SYCL backend is available, additional options can be set. - `AUTO`: check for DPC++/icpx as implementation for the SYCL backend but **do not** fail if not available - `OFF`: do not check for DPC++/icpx as implementation for the SYCL backend -- `PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS` (default: `ON`): enable SYCL's `hierarchical` and AdaptiveCpp's `scoped` kernel invocation types +- `PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS` (default: `ON`): enable SYCL's `hierarchical` data parallel kernel and AdaptiveCpp's `scoped` parallelism To use DPC++/icpx for SYCL, simply set the `CMAKE_CXX_COMPILER` to the respective DPC++/icpx clang executable during CMake invocation. @@ -355,7 +363,7 @@ If the SYCL implementation is DPC++/icpx the following additional options are av - `PLSSVM_SYCL_BACKEND_DPCPP_USE_LEVEL_ZERO` (default: `ON`): use DPC++/icpx's Level-Zero backend instead of its OpenCL backend **(only available if a CPU or Intel GPU is targeted)** -If the SYCL implementation is AdaptiveCpp the following additional option is available: +If the SYCL implementation is AdaptiveCpp, the following additional option is available: - `PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP` (default: `ON`): use AdaptiveCpp's new SSCP compilation flow @@ -497,7 +505,7 @@ Our `cmake-format` can be installed via: pip install "git+https://github.com/vancraar/cmake_format@master" ``` -To check whether formatting changes must be applied use: +To check whether formatting changes must be applied, one can use: ```bash cmake --build . --target check-cmake-format @@ -519,7 +527,7 @@ If doxygen is installed and `PLSSVM_ENABLE_DOCUMENTATION` is set to `ON` the doc cmake --build . -- doc ``` -The documentation of the current state of the main branch can be found [here](https://sc-sgs.github.io/PLSSVM/). +The documentation of the current main branch can be found [here](https://sc-sgs.github.io/PLSSVM/). 
### Installing @@ -550,13 +558,13 @@ export PYTHONPATH=${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${PY #### Install via pip -We also support a pip packages that can be used to install our library: +We also support a pip package that can be used to install our library: ```bash pip install plssvm ``` -This pip install behaves **as if** the CMake `all_python` preset is used. +This pip installation behaves **as if** the CMake `all_python` preset is used. This means that the `PLSSVM_TARGET_PLATFORMS` are automatically determined and PLSSVM is build with all supported backends that available on the target machine at the point of the `pip install plssvm` invocation. To check the installation, including, e.g., the installed backends, we provide the `plssvm-install-check` command after @@ -588,13 +596,17 @@ Issues: https://github.com/SC-SGS/PLSSVM/issues PLSSVM provides three executables: `plssvm-train`, `plssvm-predict`, and `plssvm-scale`. In addition, PLSSVM can also be used as a library in third-party code. -For more information, see the respective `man` pages which are installed via `cmake --build . -- install`. +For more information, see the respective `man` pages which are installed via `cmake --build . -- install`. + +We support the command line options of the third-party libraries [HPX](https://hpx.stellar-group.org/) and [Kokkos](https://github.com/kokkos/kokkos) +by forwarding the command line options to the respective initialization functions. +Internally, these options are filtered out before they are passed to our command line parser utility. ### Generating Artificial Data The repository comes with a Python3 script (in the `utility_scripts/` directory) to simply generate arbitrarily large classification and regression data sets. 
-In order to use all functionality, the following Python3 modules must be installed: +To use all functionality, the following Python3 modules must be installed: [`argparse`](https://docs.python.org/3/library/argparse.html), [`timeit`](https://docs.python.org/3/library/timeit.html), [`numpy`](https://pypi.org/project/numpy/), [`pandas`](https://pypi.org/project/pandas/), [`sklearn`](https://scikit-learn.org/stable/), [`arff`](https://pypi.org/project/arff/), @@ -643,7 +655,7 @@ optional arguments: ``` -An example invocation generating a classification data set consisting of blobs with 1000 data points with 200 features each and +An example invocation generating a classification data set consisting of blobs with 1000 data points with 200 features and 4 classes could look like: ```bash @@ -682,12 +694,12 @@ Usage: -c, --cost arg set the parameter C (default: 1) -e, --epsilon arg set the tolerance of termination criterion (default: 1e-10) -i, --max_iter arg set the maximum number of CG iterations (default: num_features) - -l, --solver arg choose the solver: automatic|cg_explicit|cg_implicit (default: automatic) + -l, --solver arg choose the solver: automatic|cg_explicit|cg_streaming|cg_implicit (default: automatic) -a, --classification arg the classification strategy to use for multi-class classification: oaa|oao (default: oaa) -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) - --sycl_kernel_invocation_type arg - choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) + --sycl_data_parallel_kernel arg + choose the data parallel kernel when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg @@ -709,7 +721,7 @@ Usage: The help message only print options available based on the CMake invocation. For example, if CUDA was not available during the build step, it will not show up as possible backend in the description of the `--backend` option. -The most minimal example invocation is: +The most minimal example of an invocation is: ```bash ./plssvm-train /path/to/data_file @@ -734,7 +746,7 @@ The `--backend=automatic` option works as follows: - otherwise, if the `gpu_intel` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `kokkos` 🠦 `stdpar` - otherwise, if the `cpu` target is available, check for existing backends in order `sycl` 🠦 `kokkos` 🠦 `opencl` 🠦 `openmp` 🠦 `hpx` 🠦 `stdpar` -Note that during CMake configuration it is guaranteed that at least one of the above combinations does exist. +Note that during CMake configuration, it is guaranteed that at least one of the above combinations does exist. 
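
The `--backend=automatic` fallback described above is essentially a prioritized lookup: for the detected target platform, the first backend from its preference list that was actually built is chosen. A minimal sketch of that idea, using only the `gpu_intel` and `cpu` orders quoted above (all names are illustrative, not PLSSVM code):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// illustrative only: pick the first backend from a per-target preference list that is available in this build
std::string select_backend(const std::string &target, const std::vector<std::string> &available_backends) {
    // preference orders as quoted above for the gpu_intel and cpu targets
    const std::vector<std::string> gpu_intel_order{ "sycl", "opencl", "kokkos", "stdpar" };
    const std::vector<std::string> cpu_order{ "sycl", "kokkos", "opencl", "openmp", "hpx", "stdpar" };

    for (const std::string &backend : (target == "gpu_intel" ? gpu_intel_order : cpu_order)) {
        if (std::find(available_backends.begin(), available_backends.end(), backend) != available_backends.end()) {
            return backend;  // first available backend wins
        }
    }
    return "none";  // unreachable in practice: CMake guarantees at least one valid combination
}
```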
The `--target_platform=automatic` option works for the different backends as follows: @@ -747,18 +759,19 @@ The `--target_platform=automatic` option works for the different backends as fol - `Kokkos`: checks which execution spaces are available and which target platforms they support and then tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU - `stdpar`: target device must be selected at compile time (using `PLSSVM_TARGET_PLATFORMS`) or using environment variables at runtime -The `--sycl_kernel_invocation_type` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. -If the `--sycl_kernel_invocation_type` is `automatic`, the `work_group` invocation type is currently always used. +The `--sycl_data_parallel_kernel` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. +If the `--sycl_data_parallel_kernel` is `automatic`, the `work_group` data parallel kernels are currently always used. If the `--sycl_implementation_type` is `automatic`, the used SYCL implementation is determined by the `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` CMake flag. If the `--kokkos_execution_space` is `automatic`, uses the best fitting execution space based on the provided and/or available target platforms. ### Predicting using `plssvm-predict` -Our predict utility is fully conform to LIBSVM's model files. +Our `plssvm-predict` utility is fully conforming to LIBSVM's model files. This means that our `plssvm-predict` can be used on model files learned with, e.g., LIBSVM's `svm-train`. Note: this is not the case for the regression task since the `svm_type` filed mismatch between LIBSVM (`epsilon_svr`) -and PLSSVM (`c_svr`). To automatically convert between the two, simply use the `convert_model.py` script -(in the `utility_scripts/` directory) which simply replaces these fields with the respectively expected one +and PLSSVM (`c_svr`). +To automatically convert between the two, the `convert_model.py` script (in the `utility_scripts/` directory) +can be used which simply replaces these fields with the respectively expected one (note that for large files doing that manually may be faster): ```bash @@ -796,8 +809,8 @@ Usage: -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) - --sycl_kernel_invocation_type arg - choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) + --sycl_data_parallel_kernel arg + choose the data parallel kernel when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg @@ -872,7 +885,7 @@ An example invocation to scale a train and test file in the same way looks like: ### Distributed Memory Support via MPI We support distributed memory via MPI for `plssvm-train` and `plssvm-predict` while simultaneously allowing multiple devices per MPI rank. -In order to use it, MPI must be found during the CMake configuration step. +To use MPI, it must be found during the CMake configuration step. 
Note that if MPI couldn't be found, PLSSVM still works in shared memory mode only and internally disables all MPI related functionality. For example, to run PLSSVM via MPI on four nodes simply use the normal `mpirun` command: @@ -892,17 +905,17 @@ Note that the number of provided load balancing weights must be equal to the use If one MPI rank has more than one device, all these devices on one MPI rank compute the same number of matrix elements. Our MPI implementation, however, currently has some limitations: -- the training, test, and model data is fully read by **every** MPI rank -- the training, test, and model data is fully stored on **each** compute device on **every** MPI rank +- **every** MPI rank fully reads the training, test, and model data +- **each** compute device on **every** MPI rank fully stores the training, test, and model data - **only** the kernel matrix is really divided across **all** MPI ranks - while the expensive BLAS level 3 operations in the CG algorithm are computed in a distributed way, everything else is computed on **every** MPI rank -- in the CG algorithm we communicate the whole matrix, although it would be sufficient to communicate only matrix parts +- in the CG algorithm we communicate the whole matrix, although it would be enough to communicate only matrix parts - **only** the **main** MPI rank (per default rank 0) writes the output files - `plssvm-scale` **does not** support more than one MPI rank ### Example Code for PLSSVM Used as a Library -A simple C++ program (`main_classification.cpp`) using PLSSVM as library for classification could look like: +A simple C++ program (`main_classification.cpp`) using PLSSVM as a library for classification could look like: ```cpp #include "plssvm/core.hpp" @@ -940,7 +953,7 @@ int main() { const std::vector &correct_label = test_data.labels().value(); std::cout << plssvm::classification_report{ correct_label, predicted_label } << std::endl; - // write model file to disk + // write the model file to disk model.save("model_file.libsvm"); } catch (const plssvm::exception &e) { std::cerr << e.what_with_loc() << std::endl; @@ -952,7 +965,7 @@ int main() { } ``` -A simple C++ program (`main_regression.cpp`) using PLSSVM as library for regression could look like: +A simple C++ program (`main_regression.cpp`) using PLSSVM as a library for regression could look like: ```cpp #include "plssvm/core.hpp" @@ -990,7 +1003,7 @@ int main() { const std::vector &correct_values = test_data.labels().value(); std::cout << plssvm::regression_report{ correct_label, predicted_label } << std::endl; - // write model file to disk + // write the model file to disk model.save("model_file.libsvm"); } catch (const plssvm::exception &e) { std::cerr << e.what_with_loc() << std::endl; @@ -1089,7 +1102,7 @@ plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolors="k") -# generate legend handles and add handle +# generate legend handles legend_handles = [plt.scatter([], [], color=viridis(color), label=f'{label}') for label, color in zip(y_label, np.unique(y))] plt.legend(handles=legend_handles) diff --git a/bindings/Python/README.md b/bindings/Python/README.md index 504d2533b..bcc2c2c4f 100644 --- a/bindings/Python/README.md +++ b/bindings/Python/README.md @@ -321,7 +321,7 @@ The following table lists all PLSSVM enumerations exposed on the Python side: | enumeration | values | description | 
|------------------------|----------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `TargetPlatform` | `AUTOMATIC`, `CPU`, `GPU_NVIDIA`, `GPU_AMD`, `GPU_INTEL` | The different supported target platforms (default: `AUTOMATIC`). If `AUTOMATIC` is provided, checks for available devices in the following order: NVIDIA GPUs -> AMD GPUs -> Intel GPUs -> CPUs. | -| `SolverType` | `AUTOMATIC`, `CG_EXPLICIT`, `CG_IMPLICIT` | The different supported solver types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the used solver types depends on the available device and system memory. | +| `SolverType` | `AUTOMATIC`, `CG_EXPLICIT`, `CG_STREAMING`, `CG_IMPLICIT` | The different supported solver types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the used solver types depends on the available device and system memory. | | `KernelFunctionType` | `LINEAR`, `POLYNOMIAL`, `RBF`, `SIGMOID`, `LAPLACIAN`, `CHI_SQUARED` | The different supported kernel functions (default: `RBF`). | | `FileFormatType` | `LIBSVM`, `ARFF` | The different supported file format types (default: `LIBSVM`). | | `GammaCoefficientType` | `AUTOMATIC`, `SCALE` | The different modes for the dynamic gamma calculation (default: `AUTOMATIC`). | @@ -332,10 +332,10 @@ The following table lists all PLSSVM enumerations exposed on the Python side: If a SYCL implementation is available, additional enumerations are available: -| enumeration | values | description | -|------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | -| `KernelInvocationType` | `AUTOMATIC`, `BASIC`, `WORK_GROUP`, `HIERARCHICAL`, `SCOPED` | The different supported SYCL kernel invocation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `WORK_GROUP`. | +| enumeration | values | description | +|----------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | +| `DataParallelKernel` | `AUTOMATIC`, `BASIC`, `WORK_GROUP`, `HIERARCHICAL`, `SCOPED` | The different supported SYCL data parallel kernels (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `WORK_GROUP`. 
| If the stdpar backend is available, an additional enumeration is available: @@ -469,7 +469,7 @@ The following constructors and methods are available for both classification `CS **Note**: if the backend type is `plssvm.BackendType.SYCL` two additional named parameters can be provided: `sycl_implementation_type` to choose between DPC++ and AdaptiveCpp as SYCL implementations -and `sycl_kernel_invocation_type` to choose between the two different SYCL kernel invocation types. +and `sycl_data_parallel_kernel` to choose between the different SYCL data parallel kernels. **Note**: if the backend type is `plssvm.BackendType.HPX` or `plssvm.BackendType.Kokkos` special initialization and finalization functions must be called. @@ -519,12 +519,12 @@ The following constructors and methods are available for both classification `CS | `CSVC(target, *, kernel_type=plssvm.KernelFunctionType.RBF, degree=3, gamma=plssvm.GammaCoefficientType.AUTO, coef0=0.0, cost=1.0, comm=*used MPI communicator*)` | Create a new C-SVM with the provided parameters and named arguments. | In case of the SYCL C-SVMs (`plssvm.sycl.CSVM`, `plssvm.dpcpp.CSVM`, and `plssvm.adaptivecpp.CSVM`; the same for the -`CSVR`s), additionally, all constructors also accept the SYCL specific `sycl_kernel_invocation_type` keyword parameter. +`CSVR`s), additionally, all constructors also accept the SYCL specific `sycl_data_parallel_kernel` keyword parameter. Also, the following method is additional available for the backend specific C-SVM: -| methods | description | -|--------------------------------|-----------------------------------------| -| `get_kernel_invocation_type()` | Return the SYCL kernel invocation type. | +| methods | description | +|------------------------------|--------------------------------------------| +| `get_data_parallel_kernel()` | Return the used SYCL data parallel kernel. | In case of the stdpar C-SVM (`plssvm.stdpar.CSVC` and `plssvm.stdpar.CSVR`) the following method is additional available for the backend specific C-SVM. diff --git a/bindings/Python/backends/adaptivecpp_csvm.cpp b/bindings/Python/backends/adaptivecpp_csvm.cpp index bf43d85f1..f9cc57b3a 100644 --- a/bindings/Python/backends/adaptivecpp_csvm.cpp +++ b/bindings/Python/backends/adaptivecpp_csvm.cpp @@ -6,20 +6,20 @@ * See the LICENSE.md file in the project root for full license information. 
*/ -#include "plssvm/backend_types.hpp" // plssvm::adaptivecpp::backend_csvm_type_t -#include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" // plssvm::adaptivecpp::csvm -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/svm/csvc.hpp" // plssvm::csvc -#include "plssvm/svm/csvm.hpp" // plssvm::csvm -#include "plssvm/svm/csvr.hpp" // plssvm::csvr -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::adaptivecpp::backend_csvm_type_t +#include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" // plssvm::adaptivecpp::csvm +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/svm/csvc.hpp" // plssvm::csvc +#include "plssvm/svm/csvm.hpp" // plssvm::csvm +#include "plssvm/svm/csvr.hpp" // plssvm::csvr +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/type_caster/mpi_type_caster.hpp" // a custom Pybind11 type caster for a plssvm::mpi::communicator #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::register_py_exception @@ -49,18 +49,18 @@ void bind_adaptivecpp_csvms(py::module_ &m, const std::string &csvm_name) { const std::string keyword_args_constructor_docstring{ fmt::format("create an AdaptiveCpp SYCL {} with the provided SVM parameter as separate keyword arguments including optional SYCL specific keyword arguments", csvm_name) }; py::class_(m, csvm_name.c_str(), class_docstring.c_str()) - .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), params_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, py::kw_only(), py::arg("params") = default_params, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const 
int degree, const plssvm::gamma_type gamma, const plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { + .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const int degree, const plssvm::gamma_type gamma, const plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { const plssvm::parameter params{ kernel_type, degree, gamma, coef0, cost }; - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), keyword_args_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, @@ -70,11 +70,11 @@ void bind_adaptivecpp_csvms(py::module_ &m, const std::string &csvm_name) { py::arg("gamma") = default_params.gamma, py::arg("coef0") = default_params.coef0, py::arg("cost") = default_params.cost, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def("get_kernel_invocation_type", &plssvm::adaptivecpp::csvm::get_kernel_invocation_type, "get the kernel invocation type used in this SYCL C-SVM") + .def("get_data_parallel_kernel", &plssvm::adaptivecpp::csvm::get_data_parallel_kernel, "get the data parallel kernel used in this SYCL C-SVM") .def("__repr__", [csvm_name](const backend_csvm_type &self) { - return fmt::format("", csvm_name, self.num_available_devices(), self.get_kernel_invocation_type()); + return fmt::format("", csvm_name, self.num_available_devices(), self.get_data_parallel_kernel()); }); } diff --git a/bindings/Python/backends/dpcpp_csvm.cpp b/bindings/Python/backends/dpcpp_csvm.cpp index 51dcd7e16..152849bd7 100644 --- a/bindings/Python/backends/dpcpp_csvm.cpp +++ b/bindings/Python/backends/dpcpp_csvm.cpp @@ -6,20 +6,20 @@ * See the LICENSE.md file in the project root for full license information. 
*/ -#include "plssvm/backend_types.hpp" // plssvm::dpcpp::backend_csvm_type_t -#include "plssvm/backends/SYCL/DPCPP/csvm.hpp" // plssvm::dpcpp::csvm -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/svm/csvc.hpp" // plssvm::csvc -#include "plssvm/svm/csvm.hpp" // plssvm::csvm -#include "plssvm/svm/csvr.hpp" // plssvm::csvr -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::dpcpp::backend_csvm_type_t +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/DPCPP/csvm.hpp" // plssvm::dpcpp::csvm +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/svm/csvc.hpp" // plssvm::csvc +#include "plssvm/svm/csvm.hpp" // plssvm::csvm +#include "plssvm/svm/csvr.hpp" // plssvm::csvr +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/type_caster/mpi_type_caster.hpp" // a custom Pybind11 type caster for a plssvm::mpi::communicator #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::register_py_exception @@ -49,18 +49,18 @@ void bind_dpcpp_csvms(py::module_ &m, const std::string &csvm_name) { const std::string keyword_args_constructor_docstring{ fmt::format("create a DPC++ SYCL {} with the provided SVM parameter as separate keyword arguments including optional SYCL specific keyword arguments", csvm_name) }; py::class_(m, csvm_name.c_str(), class_docstring.c_str()) - .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), params_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, py::kw_only(), py::arg("params") = default_params, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const int degree, const plssvm::gamma_type gamma, const 
plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { + .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const int degree, const plssvm::gamma_type gamma, const plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { const plssvm::parameter params{ kernel_type, degree, gamma, coef0, cost }; - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), keyword_args_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, @@ -70,11 +70,11 @@ void bind_dpcpp_csvms(py::module_ &m, const std::string &csvm_name) { py::arg("gamma") = default_params.gamma, py::arg("coef0") = default_params.coef0, py::arg("cost") = default_params.cost, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def("get_kernel_invocation_type", &plssvm::dpcpp::csvm::get_kernel_invocation_type, "get the kernel invocation type used in this SYCL C-SVM") + .def("get_data_parallel_kernel", &plssvm::dpcpp::csvm::get_data_parallel_kernel, "get the data parallel kernel used in this SYCL C-SVM") .def("__repr__", [csvm_name](const backend_csvm_type &self) { - return fmt::format("", csvm_name, self.num_available_devices(), self.get_kernel_invocation_type()); + return fmt::format("", csvm_name, self.num_available_devices(), self.get_data_parallel_kernel()); }); } diff --git a/bindings/Python/backends/sycl.cpp b/bindings/Python/backends/sycl.cpp index 98c27214b..3bf6b6c30 100644 --- a/bindings/Python/backends/sycl.cpp +++ b/bindings/Python/backends/sycl.cpp @@ -6,10 +6,10 @@ * See the LICENSE.md file in the project root for full license information. 
*/ -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{implementation_type, list_available_sycl_implementations} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{implementation_type, list_available_sycl_implementations} +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::{register_py_exception, register_implicit_str_enum_conversion} @@ -45,16 +45,16 @@ void init_sycl(py::module_ &m, const py::exception &base_exce sycl_module.def("list_available_sycl_implementations", &plssvm::sycl::list_available_sycl_implementations, "list all available SYCL implementations"); - py::enum_ py_enum_invocation(sycl_module, "KernelInvocationType", "Enum class for all possible SYCL kernel invocation types supported in PLSSVM."); - py_enum_invocation - .value("AUTOMATIC", plssvm::sycl::kernel_invocation_type::automatic, "use the best kernel invocation type for the current SYCL implementation and target hardware platform") - .value("BASIC", plssvm::sycl::kernel_invocation_type::basic, "use the basic data parallel kernel invocation type") - .value("WORK_GROUP", plssvm::sycl::kernel_invocation_type::work_group, "use the work-group data parallel kernel invocation type") - .value("HIERARCHICAL", plssvm::sycl::kernel_invocation_type::hierarchical, "use the hierarchical data parallel kernel invocation type") - .value("SCOPED", plssvm::sycl::kernel_invocation_type::scoped, "use the AdaptiveCpp specific scoped parallelism kernel invocation type"); + py::enum_ py_enum_data_parallel_kernel(sycl_module, "DataParallelKernel", "Enum class for all possible SYCL data parallel kernels supported in PLSSVM."); + py_enum_data_parallel_kernel + .value("AUTOMATIC", plssvm::sycl::data_parallel_kernel::automatic, "use the best data parallel kernel for the current SYCL implementation and target hardware platform") + .value("BASIC", plssvm::sycl::data_parallel_kernel::basic, "use the basic data parallel kernel") + .value("WORK_GROUP", plssvm::sycl::data_parallel_kernel::work_group, "use the work-group data parallel kernel") + .value("HIERARCHICAL", plssvm::sycl::data_parallel_kernel::hierarchical, "use the hierarchical data parallel kernel") + .value("SCOPED", plssvm::sycl::data_parallel_kernel::scoped, "use the AdaptiveCpp specific scoped parallelism kernel"); // enable implicit conversion from string to enum - plssvm::bindings::python::util::register_implicit_str_enum_conversion(py_enum_invocation); + plssvm::bindings::python::util::register_implicit_str_enum_conversion(py_enum_data_parallel_kernel); // initialize SYCL binding classes #if defined(PLSSVM_SYCL_BACKEND_HAS_ADAPTIVECPP) diff --git a/bindings/Python/solver_types.cpp b/bindings/Python/solver_types.cpp index f8309fb4b..cb6ca843c 100644 --- a/bindings/Python/solver_types.cpp +++ b/bindings/Python/solver_types.cpp @@ -20,6 +20,7 @@ void init_solver_types(py::module_ &m) { py_enum .value("AUTOMATIC", plssvm::solver_type::automatic, "the default solver type; depends on the available device and system memory") 
.value("CG_EXPLICIT", plssvm::solver_type::cg_explicit, "explicitly assemble the kernel matrix on the device") + .value("CG_STREAMING", plssvm::solver_type::cg_streaming, "explicitly calculate the kernel matrix and fully store it on the host; realized using unified shared memory") .value("CG_IMPLICIT", plssvm::solver_type::cg_implicit, "implicitly calculate the kernel matrix entries in each CG iteration"); // enable implicit conversion from string to enum diff --git a/bindings/Python/svm/utility.hpp b/bindings/Python/svm/utility.hpp index 38019bf8b..5b3bb823a 100644 --- a/bindings/Python/svm/utility.hpp +++ b/bindings/Python/svm/utility.hpp @@ -13,14 +13,14 @@ #define PLSSVM_BINDINGS_PYTHON_SVM_UTILITY_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/csvm_factory.hpp" // plssvm::make_csvm -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter, named arguments -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/csvm_factory.hpp" // plssvm::make_csvm +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter, named arguments +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::check_kwargs_for_correctness @@ -46,7 +46,7 @@ namespace plssvm::bindings::python::util { template [[nodiscard]] inline std::unique_ptr assemble_csvm(const plssvm::backend_type backend, const plssvm::target_platform target, const plssvm::parameter ¶ms, plssvm::mpi::communicator comm, const py::kwargs &optional_args) { // check keyword arguments - plssvm::bindings::python::util::check_kwargs_for_correctness(optional_args, { "foo", "sycl_implementation_type", "sycl_kernel_invocation_type", "kokkos_execution_space" }); + plssvm::bindings::python::util::check_kwargs_for_correctness(optional_args, { "foo", "sycl_implementation_type", "sycl_data_parallel_kernel", "kokkos_execution_space" }); if (backend == plssvm::backend_type::sycl) { // parse SYCL specific keyword arguments @@ -54,12 +54,12 @@ template if (optional_args.contains("sycl_implementation_type")) { impl_type = optional_args["sycl_implementation_type"].cast(); } - plssvm::sycl::kernel_invocation_type invocation_type = plssvm::sycl::kernel_invocation_type::automatic; - if (optional_args.contains("sycl_kernel_invocation_type")) { - invocation_type = optional_args["sycl_kernel_invocation_type"].cast(); + plssvm::sycl::data_parallel_kernel data_parallel_kernel_type = plssvm::sycl::data_parallel_kernel::automatic; + if (optional_args.contains("sycl_data_parallel_kernel")) { + data_parallel_kernel_type = optional_args["sycl_data_parallel_kernel"].cast(); } - return plssvm::make_csvm(backend, std::move(comm), target, params, plssvm::sycl_implementation_type 
= impl_type, plssvm::sycl_kernel_invocation_type = invocation_type); + return plssvm::make_csvm(backend, std::move(comm), target, params, plssvm::sycl_implementation_type = impl_type, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); } else if (backend == plssvm::backend_type::kokkos) { // parse Kokkos specific keyword arguments plssvm::kokkos::execution_space space = plssvm::kokkos::execution_space::automatic; diff --git a/docs/plssvm-train.1.in b/docs/plssvm-train.1.in index deae26bc1..f5976c39c 100644 --- a/docs/plssvm-train.1.in +++ b/docs/plssvm-train.1.in @@ -48,7 +48,7 @@ the maximum number of CG iterations (default: #features) .TP .B -l, --solver arg -choose the solver: automatic|cg_explicit|cg_implicit (default: automatic) +choose the solver: automatic|cg_explicit|cg_streaming|cg_implicit (default: automatic) .TP .B -a, --classification arg diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index ec02c80c0..c45902c8e 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -22,6 +22,7 @@ #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES, plssvm::detail::is_one_type_of #include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator #include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::has_only_parameter_named_args_v +#include "plssvm/solver_types.hpp" // plssvm::solver_type #include "plssvm/svm/csvc.hpp" // plssvm::csvc #include "plssvm/svm/csvm.hpp" // plssvm::detail::csvm_backend_exists #include "plssvm/svm/csvr.hpp" // plssvm::csvr @@ -109,7 +110,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t namespace plssvm::cuda::detail { @@ -22,8 +24,8 @@ namespace plssvm::cuda::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -32,78 +34,77 @@ namespace plssvm::cuda::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays 
attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + 
PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -115,8 +116,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -125,68 +126,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> 
num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + 
threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -200,27 +205,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the 
indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -233,27 +240,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + 
grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 8a766b7db..70c9b4101 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -14,20 +14,22 @@ #pragma once #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
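// Editorial aside (illustrative only): the padded, row-major ("SoA") indexing used by the element-wise
// kernels above, where every row carries PADDING extra entries and the logical element (i, j) lives at
// i * (num_cols + PADDING) + j. PADDING and the helper below are placeholders for this sketch, not the
// PLSSVM constants or API.
#include <cstddef>

constexpr std::size_t PADDING = 16;  // stand-in value, not plssvm::PADDING_SIZE

inline std::size_t padded_index(const std::size_t i, const std::size_t j, const std::size_t num_cols) {
    return i * (num_cols + PADDING) + j;
}

// usage corresponding to the in-place kernels:
//   lhs[padded_index(i, j, num_cols)] += rhs[padded_index(i, j, num_cols)];  // matrix add
//   lhs[padded_index(i, j, num_cols)] *= scale;                              // matrix scale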
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -37,80 +39,84 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto 
i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to 
not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 62f24d6bf..186400757 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -15,9 +15,11 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** @@ -26,10 +28,10 @@ namespace plssvm::cuda::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -41,56 +43,64 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type 
QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * 
blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for 
(unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -101,16 +111,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -120,42 +132,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + 
temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -164,51 +178,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE 
= 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 8003a51a3..7748c45c8 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
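
// Editor's note (illustration only, not part of the patch): the simplified powi below evaluates
// base^exponent with a plain O(exponent) loop. A minimal host-side sketch of an alternative,
// exponentiation by squaring, which needs only O(log exponent) multiplications; the function name
// powi_by_squaring and the use of double are assumptions made for this sketch.
#include <cassert>

double powi_by_squaring(double base, int exponent) {
    assert(exponent >= 0);  // the surrounding documentation states the degree may only be a positive integer
    double result = 1.0;
    while (exponent > 0) {
        if (exponent & 1) {  // multiply the result by the current power of base if this exponent bit is set
            result *= base;
        }
        base *= base;        // square the base for the next binary digit of the exponent
        exponent >>= 1;
    }
    return result;
}
// e.g., powi_by_squaring(2.0, 6) returns 64.0, the same value as the plain loop below
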
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 204d6bd97..d7ebf45a3 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -15,166 +15,180 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
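
// Editor's note (illustration only, not part of the patch): a minimal host-side reference for the
// quantity device_kernel_w_linear accumulates. For the linear kernel,
//   w[feature][class] = sum over all support vectors of alpha[class][sv] * support_vectors[sv][feature];
// the blocking, padding, and SoA/AoS layouts in the CUDA kernel only change how this sum is evaluated,
// not its value. The plain 2D-vector layouts and names used here are assumptions for this sketch.
#include <cstddef>
#include <vector>

// alpha:           [num_classes][num_sv]
// support_vectors: [num_sv][num_features] (assumed non-empty)
// returns w:       [num_features][num_classes]
std::vector<std::vector<double>> reference_w(const std::vector<std::vector<double>> &alpha,
                                             const std::vector<std::vector<double>> &support_vectors) {
    const std::size_t num_classes = alpha.size();
    const std::size_t num_features = support_vectors.front().size();

    std::vector<std::vector<double>> w(num_features, std::vector<double>(num_classes, 0.0));
    for (std::size_t sv = 0; sv < support_vectors.size(); ++sv) {
        for (std::size_t feature = 0; feature < num_features; ++feature) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                w[feature][c] += alpha[c][sv] * support_vectors[sv][feature];
            }
        }
    }
    return w;
}
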
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
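
// Editor's note (illustration only, not part of the patch): a minimal host-side sketch of the value
// device_kernel_predict_linear produces for one data point and one class,
//   prediction[point][class] = sum over all features of predict_points[point][feature] * w[feature][class] - rho[class].
// The function name and the plain 2D-vector layout are assumptions for this sketch.
#include <cstddef>
#include <vector>

// point: [num_features], w: [num_features][num_classes], rho: [num_classes]
double reference_predict_linear_one(const std::vector<double> &point,
                                    const std::vector<std::vector<double>> &w,
                                    const std::vector<double> &rho,
                                    const std::size_t class_idx) {
    double result = 0.0;
    for (std::size_t feature = 0; feature < point.size(); ++feature) {
        result += point[feature] * w[feature][class_idx];  // dot product with the w column of this class
    }
    return result - rho[class_idx];  // subtract the learned bias exactly once
}
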
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = 
static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * 
INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
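
// Editor's note (illustration only, not part of the patch): a minimal host-side sketch of the decision
// value the generic device_kernel_predict accumulates for one data point and one class,
//   prediction[point][class] = sum over all support vectors of alpha[class][sv] * k(support_vectors[sv], point) - rho[class],
// where k is the chosen kernel function. The CUDA kernel evaluates k in two stages, a per-feature
// reduction (feature_reduce) followed by apply_kernel_function on the reduced value, and applies the
// bias only once per class. The std::function parameter and the plain 2D-vector layout are assumptions.
#include <cstddef>
#include <functional>
#include <vector>

// alpha: [num_classes][num_sv], rho: [num_classes], support_vectors: [num_sv][num_features], point: [num_features]
double reference_predict_one(const std::vector<std::vector<double>> &alpha,
                             const std::vector<double> &rho,
                             const std::vector<std::vector<double>> &support_vectors,
                             const std::vector<double> &point,
                             const std::function<double(const std::vector<double> &, const std::vector<double> &)> &k,
                             const std::size_t class_idx) {
    double result = -rho[class_idx];  // the bias is applied exactly once per class
    for (std::size_t sv = 0; sv < support_vectors.size(); ++sv) {
        result += alpha[class_idx][sv] * k(support_vectors[sv], point);
    }
    return result;
}
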
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -184,53 +198,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + 
global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -239,55 +255,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + 
THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } diff --git a/include/plssvm/backends/HIP/csvm.hpp b/include/plssvm/backends/HIP/csvm.hpp index e1f64e58e..86dd0af9b 100644 --- a/include/plssvm/backends/HIP/csvm.hpp +++ b/include/plssvm/backends/HIP/csvm.hpp @@ -109,7 +109,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t + namespace plssvm::hip::detail { /** @@ -25,8 +27,8 @@ namespace plssvm::hip::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -35,78 +37,77 @@ namespace plssvm::hip::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + 
threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + 
threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads 
performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -118,8 +119,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -128,68 +129,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of 
threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * 
THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * 
INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -203,27 +208,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent 
potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -236,27 +243,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index 75a3cd9a5..308867d76 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -14,23 +14,25 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
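For orientation, the assembly kernel documented next stores only the upper triangle of the kernel matrix, one padded row per matrix row; the index expression used further down in this hunk (`device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx`) follows from that layout. The sketch below is illustrative only; the helper name and the parameters `N` and `padding` are stand-ins for `num_rows - device_row_offset` and `PADDING_SIZE`, not part of the diff.

```cpp
#include <cstddef>

// element (j, i) of the packed, padded upper triangular storage, assuming i >= j:
// rows 0..j-1 occupy (N - k + padding) entries each, and row j starts at column j
constexpr std::size_t packed_upper_triangular_index(const std::size_t j, const std::size_t i,
                                                    const std::size_t N, const std::size_t padding) {
    return j * (N + padding) - j * (j + 1) / 2 + i;
}

// with N = 4 and padding = 2: row 0 holds 6 entries, so element (1, 1) is the first entry of row 1
static_assert(packed_upper_triangular_index(0, 0, 4, 2) == 0, "row 0 starts at index 0");
static_assert(packed_upper_triangular_index(1, 1, 4, 2) == 6, "row 1 starts right after row 0");
```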
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -40,80 +42,84 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto 
i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to 
not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 77820e35a..b2bee8d46 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -14,12 +14,14 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -28,10 +30,10 @@ namespace plssvm::hip::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -43,56 +45,64 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, 
real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // 
device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < 
FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -103,16 +113,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -122,42 +134,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + 
temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -166,51 +180,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE 
= 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp index a98bb0715..1b2be0ae6 100644 --- a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
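The hunk below drops the hardcoded `powi` cases for exponents <= 6 in favour of the generic loop. As a quick sanity check that the loop reproduces the old special cases, a constexpr host-side twin can be compared against them at compile time; `powi_ref` is illustrative only and not part of PLSSVM.

```cpp
// constexpr twin of the generic integer power loop introduced below (C++14 or newer)
constexpr double powi_ref(const double base, const int exponent) {
    double result{ 1.0 };
    for (int i = 0; i < exponent; ++i) {
        result *= base;
    }
    return result;
}

static_assert(powi_ref(2.0, 0) == 1.0, "x^0 == 1, matches the old case 0");
static_assert(powi_ref(2.0, 3) == 8.0, "matches the old case 3");
static_assert(powi_ref(2.0, 6) == 64.0, "matches the old case 6");
```

For the small polynomial degrees the old special cases targeted, the plain loop costs at most a handful of multiplications; exponentiation by squaring would only pay off for much larger exponents.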
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6e349927e..9ee22edc4 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -14,169 +14,183 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include <cstddef> // std::size_t + namespace plssvm::hip::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function.
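Functionally, `device_kernel_w_linear` (documented next) computes one weighted sum of support vectors per class: w[class][feature] = Σ_sv alpha[class][sv] · support_vector[sv][feature]. The following host-side sketch shows that reduction with simplified, unpadded vector-of-vector layouts; the function name and layouts are illustrative assumptions, not the PLSSVM API.

```cpp
#include <cstddef>
#include <vector>

// alpha:           [num_classes][num_sv]       (the previously learned weights)
// support_vectors: [num_sv][num_features]
// returns w:       [num_classes][num_features]
std::vector<std::vector<double>> compute_w_linear(const std::vector<std::vector<double>> &alpha,
                                                  const std::vector<std::vector<double>> &support_vectors) {
    const std::size_t num_classes = alpha.size();
    const std::size_t num_sv = support_vectors.size();
    const std::size_t num_features = support_vectors.front().size();

    std::vector<std::vector<double>> w(num_classes, std::vector<double>(num_features, 0.0));
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t sv = 0; sv < num_sv; ++sv) {
            for (std::size_t f = 0; f < num_features; ++f) {
                w[c][f] += alpha[c][sv] * support_vectors[sv][f];  // weighted sum of the support vectors
            }
        }
    }
    return w;
}
```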
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
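The companion kernel `device_kernel_predict_linear` (documented next) then reduces each prediction to a dot product with the precomputed `w` plus the bias term: prediction[point][class] = ⟨w[class], point⟩ - rho[class]. A minimal host-side sketch, using the same simplified layouts as the `w` sketch above (names are illustrative, not the PLSSVM API):

```cpp
#include <cstddef>
#include <vector>

// w:              [num_classes][num_features]
// rho:            [num_classes]                (the previously learned biases)
// predict_points: [num_predict_points][num_features]
// returns:        [num_predict_points][num_classes]
std::vector<std::vector<double>> predict_linear(const std::vector<std::vector<double>> &w,
                                                const std::vector<double> &rho,
                                                const std::vector<std::vector<double>> &predict_points) {
    std::vector<std::vector<double>> prediction(predict_points.size(), std::vector<double>(w.size(), 0.0));
    for (std::size_t pp = 0; pp < predict_points.size(); ++pp) {
        for (std::size_t c = 0; c < w.size(); ++c) {
            double dot = 0.0;
            for (std::size_t f = 0; f < predict_points[pp].size(); ++f) {
                dot += w[c][f] * predict_points[pp][f];  // dot product with the precomputed w vector
            }
            prediction[pp][c] = dot - rho[c];  // subtract the learned bias
        }
    }
    return prediction;
}
```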
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = 
static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * 
INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
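The generic predict kernel documented below declares two flat shared-memory buffers (`cache_one`, `cache_two`) and reinterprets them as two-dimensional tiles, first for the predict-point and support-vector features and later, reusing the same memory, for the alpha values and the partial results. The cast itself is plain C++; a minimal host-side sketch with illustrative tile sizes and `float` standing in for `real_type`:

    #include <cassert>

    constexpr unsigned THREAD_BLOCK_SIZE = 4;   // illustrative only
    constexpr unsigned INTERNAL_BLOCK_SIZE = 2; // illustrative only

    int main() {
        // stands in for the flat __shared__ buffer inside the kernel
        static float cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]{};

        // view the same memory as [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]
        auto pp_cache = reinterpret_cast<float (*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]>(cache_one);
        pp_cache[1][3] = 42.0f;

        // element [1][3] aliases flat element 1 * (INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE) + 3
        assert(cache_one[1 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + 3] == 42.0f);
        return 0;
    }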
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -186,53 +200,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + 
global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -241,55 +257,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + 
THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp index 20cbad247..579a56715 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp @@ -34,60 +34,63 @@ namespace plssvm::hpx::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / 
INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -95,13 +98,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const 
std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -113,70 +117,74 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= 
device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values using blocking + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + sum += A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // 
SoA, upper triangular matrix only + B(global_i_idx, dim_block + dim + device_row_offset); // SoA + } + temp[internal_j][internal_i] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index e575c6af2..f1cf4723e 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #pragma once #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix @@ -32,82 +32,89 @@ namespace plssvm::hpx::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Assemble the kernel matrix using the @p kernel_function function. 
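The assembly kernel documented below writes only the upper triangular part of the per-device kernel matrix into a packed, padded 1D buffer, using the same index scheme the SYMM kernels above read from: idx = j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i for i >= j, with n = num_rows - device_row_offset. A small self-contained sketch of that packed layout (sizes and the padding value are illustrative):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        constexpr std::size_t n = 4;            // rows this device is responsible for (illustrative)
        constexpr std::size_t PADDING_SIZE = 2; // assumed padding, smaller than the real constant

        // packed upper triangular storage; sized so every access with j <= i < n stays in bounds
        std::vector<double> packed(n * (n + PADDING_SIZE) - n * (n - 1) / 2, 0.0);

        const auto packed_index = [=](const std::size_t i, const std::size_t j) {  // requires i >= j
            return j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i;
        };

        packed[packed_index(3, 1)] = 42.0;               // symmetric entry (3, 1), i.e. also (1, 3)
        std::cout << packed[packed_index(3, 1)] << '\n'; // prints 42
        return 0;
    }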
+ * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto 
PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; 
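Written out, each finished entry produced by this epilogue is K(i, j) = apply_kernel_function(reduced_ij) + QA_cost - q[i] - q[j], with the cost term added once on the main diagonal. A scalar sketch of that final step, assuming the linear kernel (for which the final transform is the identity) and illustrative names:

    #include <cstddef>
    #include <vector>

    double kernel_matrix_entry(const double reduced,  // accumulated feature reduction for the pair (i, j)
                               const double QA_cost, const double cost,
                               const std::vector<double> &q,
                               const std::size_t i, const std::size_t j) {
        // linear kernel: the reduced dot product is used as-is
        double value = reduced + QA_cost - q[i] - q[j];
        if (i == j) {
            value += cost;  // cost (= 1 / plssvm::cost) is only added on the main diagonal
        }
        return value;
    }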
- temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 06df89dac..7b8d79e1b 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -34,25 +34,25 @@ namespace plssvm::hpx::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. 
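The implicit variant documented below never materializes the kernel matrix: each upper triangular entry is consumed immediately and, off the diagonal, contributes to C twice via symmetry, which is why the updates have to be atomic. A tiny dense, single-threaded reference of that update rule (names and the row-major layout are illustrative, not the library API):

    #include <cstddef>
    #include <vector>

    // B and C are stored row-major with shape [num_classes][num_rows]
    void symm_update(std::vector<double> &C, const std::vector<double> &B,
                     const std::size_t num_classes, const std::size_t num_rows,
                     const std::size_t i, const std::size_t j,
                     const double alpha, const double kernel_ij) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            // entry (i, j) contributes B(c, j) to C(c, i) ...
            C[c * num_rows + i] += alpha * kernel_ij * B[c * num_rows + j];
            if (i != j) {
                // ... and, by symmetry, B(c, i) to C(c, j), but only for off-diagonal entries
                C[c * num_rows + j] += alpha * kernel_ij * B[c * num_rows + i];
            }
        }
    }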
+ * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -61,64 +61,92 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent 
potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + 
static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_row); + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_col); + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry - atomic_ref{ C(class_idx, global_col) } += alpha * temp_ij * B(class_idx, global_row); + atomic_ref{ C(class_block + class_idx, 
global_j_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp index 6c0cd8a43..35e79d01d 100644 --- a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp @@ -28,42 +28,17 @@ namespace plssvm::hpx::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 7ea68e172..d5e811c63 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -16,7 +16,7 @@ #include "plssvm/backends/HPX/detail/utility.hpp" // plssvm::hpx::detail::atomic_ref #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix @@ -38,59 +38,63 @@ namespace plssvm::hpx::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == 
support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = support_vectors.num_cols(); const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated + // define the range over which should be iterated std::vector range(blocked_num_classes * blocked_num_features); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = 
class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha(global_class_idx, sv_offset + sv) * support_vectors(sv_offset + sv, global_feature_idx); + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; } } } - // update global array with local one + // store the result back to the w vector for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w(global_class_idx, global_feature_idx) = temp[internal_feature][internal_class]; + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } }); @@ -102,63 +106,64 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = 
static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_classes); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_pp][internal_class] += w(global_class_idx, dim) * predict_points(global_pp_idx, dim); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; } } } - // perform the dot product calculation + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { - prediction(global_pp_idx, 
global_class_idx) = temp[internal_pp][internal_class] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } }); @@ -166,61 +171,63 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons /** * @brief Predict the @p predict_points_d using the @p kernel_function. - * @tparam kernel the type of the used kernel function + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function * @param[out] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_support_vectors); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx % blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> 
temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), - predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -228,25 +235,23 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } - // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + // atomically add the results to the prediction + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction(global_pp_idx, a) } += -rho[a]; + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += -rho[class_block + class_idx]; } - atomic_ref{ prediction(global_pp_idx, a) } += - temp[internal_pp][internal_sv] * alpha(a, global_sv_idx); + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp index 5a77ef1e1..3098a0d87 100644 --- 
a/include/plssvm/backends/Kokkos/csvm.hpp +++ b/include/plssvm/backends/Kokkos/csvm.hpp @@ -139,7 +139,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::array + +namespace plssvm::kokkos::detail { + +/** + * @brief List all available Kokkos::MemorySpaces at compile time. + * @details The `memory_space::host_space` is always available! + * @return a `std::array` containing all available memory spaces (`[[nodiscard]]`) + */ +[[nodiscard]] inline constexpr auto constexpr_available_memory_spaces() noexcept { + // Note: the trailing comma is explicitly allowed by the standard + return std::array{ + memory_space::host_space, +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_CUDA) + memory_space::cuda_space, + memory_space::cuda_usm_space, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HIP) + memory_space::hip_space, + memory_space::hip_usm_space, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_SYCL) + memory_space::sycl_space, + memory_space::sycl_usm_space, +#endif + }; +} + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_MEMORY_SPACES_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp index ad067d00b..19cc9cb60 100644 --- a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp +++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp @@ -34,6 +34,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr // std::array #include // std::size_t @@ -38,27 +39,27 @@ struct create_view_variant_type_helper; /** * @brief Helper struct to create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of - * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * `plssvm::kokkos::memory_space` values as returned by `plssvm::kokkos::detail::constexpr_available_memory_spaces()`. * @tparam T the value type of the underlying Kokkos::View * @tparam Is the indices to index the `std::array` */ template struct create_view_variant_type_helper> { - /// The array containing all available execution spaces. - constexpr static auto array = detail::constexpr_available_execution_spaces(); + /// The array containing all available memory spaces. + constexpr static auto array = detail::constexpr_available_memory_spaces(); /// The resulting variant type. - using type = std::variant>...>; + using type = std::variant>...>; }; /** * @brief Create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of - * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * `plssvm::kokkos::memory_space` values as returned by `plssvm::kokkos::detail::constexpr_available_memory_spaces()`. * @tparam T the value type of the underlying Kokkos::View */ template struct create_view_variant_type { /// The number of types in the final variant. - constexpr static std::size_t N = detail::constexpr_available_execution_spaces().size(); + constexpr static std::size_t N = detail::constexpr_available_memory_spaces().size(); /// The final variant type. using type = typename create_view_variant_type_helper>::type; }; @@ -82,37 +83,49 @@ class device_view_wrapper { /** * @brief Construct the wrapper using the provided Kokkos::View instance by forwarding its value to the underlying `std::variant`. 
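The `device_view_wrapper` introduced in this hunk type-erases the concrete `Kokkos::View` behind a `std::variant` whose alternatives are generated from the `constexpr` array of `memory_space` values. The following Kokkos-free sketch illustrates that pattern; all names (`memory_space`, `buffer`, `buffer_wrapper`, ...) are hypothetical stand-ins, not the actual PLSSVM types.

```cpp
// Minimal sketch: a std::variant whose alternatives are generated from a
// constexpr array of enum values, plus a get<enum>() accessor and an
// index -> enum lookup (mirroring device_view_wrapper / create_view_variant_type).
#include <array>
#include <cstddef>
#include <utility>
#include <variant>

enum class memory_space { host, device, device_usm };

constexpr std::array available_spaces{ memory_space::host, memory_space::device, memory_space::device_usm };

// stand-in for a Kokkos::View placed in a specific memory space
template <typename T, memory_space Space>
struct buffer {
    std::size_t size{};
};

// build std::variant<buffer<T, available_spaces[0]>, buffer<T, available_spaces[1]>, ...>
template <typename T, std::size_t... Is>
constexpr auto make_variant(std::index_sequence<Is...>) -> std::variant<buffer<T, available_spaces[Is]>...>;

template <typename T>
using buffer_variant_t = decltype(make_variant<T>(std::make_index_sequence<available_spaces.size()>{}));

template <typename T>
class buffer_wrapper {
  public:
    template <memory_space Space>
    explicit buffer_wrapper(buffer<T, Space> buf) :
        v_{ std::move(buf) } { }

    // retrieve the alternative that corresponds to the requested enum value
    template <memory_space Space>
    [[nodiscard]] buffer<T, Space> &get() { return std::get<buffer<T, Space>>(v_); }

    // the active alternative's index maps back to the enum value
    [[nodiscard]] constexpr memory_space get_memory_space() const noexcept { return available_spaces[v_.index()]; }

  private:
    buffer_variant_t<T> v_;
};

int main() {
    buffer_wrapper<double> w{ buffer<double, memory_space::device>{ 1024 } };
    return w.get_memory_space() == memory_space::device ? 0 : 1;
}
```

Because the enum array and the variant alternatives are generated from the same `std::index_sequence`, `v_.index()` can be mapped back to the enum value without any extra bookkeeping, which is what `get_memory_space()` relies on above and in the real wrapper.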
- * @tparam ExecutionSpace the used Kokkos::ExecutionSpace type of the Kokkos::View + * @tparam MemorySpace the used Kokkos::MemorySpace type of the Kokkos::View * @param[in] view the Kokkos::View instance */ - template - explicit device_view_wrapper(Kokkos::View &&view) : - v_{ std::move(view) } { } + template + explicit device_view_wrapper(Kokkos::View &&view, const bool use_usm_allocations = false) : + v_{ std::move(view) }, + use_usm_allocations_{ use_usm_allocations } { } /** * @brief Given the provided `execution_space` enum value, tries to get the `std::variant` alternative for the corresponding Kokkos::ExecutionSpace type. * @tparam space the `execution_space` enum value + * @tparam use_usm_allocations if `true` use USM allocations * @return the Kokkos::View instance (`[[nodiscard]]`) */ - template - [[nodiscard]] Kokkos::View> &get() { - return std::get>>(v_); + template + [[nodiscard]] auto &get() { + constexpr memory_space mem_space = execution_space_to_memory_space_v; + return std::get>>(v_); } /** * @copydoc plssvm::kokkos::detail::device_view_wrapper::get */ - template - [[nodiscard]] const Kokkos::View> &get() const { - return std::get>>(v_); + template + [[nodiscard]] const auto &get() const { + constexpr memory_space mem_space = execution_space_to_memory_space_v; + return std::get>>(v_); } /** - * @brief Return the `execution_space` enum value of the currently active `std::variant` Kokkos::View type. - * @return the `execution_space` enum value (`[[nodiscard]]`) + * @brief Return the `memory_space` enum value of the currently active `std::variant` Kokkos::View type. + * @return the `memory_space` enum value (`[[nodiscard]]`) */ - [[nodiscard]] execution_space get_execution_space() const noexcept { - return detail::constexpr_available_execution_spaces()[v_.index()]; + [[nodiscard]] constexpr memory_space get_memory_space() const noexcept { + return detail::constexpr_available_memory_spaces()[v_.index()]; + } + + /** + * @brief Check whether USM allocations are used. + * @return `true` if USM allocations are used, `false` otherwise (`[[nodiscard]]`) + */ + [[nodiscard]] bool uses_usm_allocations() const noexcept { + return use_usm_allocations_; } /** @@ -164,6 +177,8 @@ class device_view_wrapper { private: /// The wrapped `std::variant` type. variant_type v_; + /// `true` if USM allocations and, therefore, other Kokkos::MemorySpaces, are used. + bool use_usm_allocations_; }; /** @@ -171,14 +186,20 @@ class device_view_wrapper { * @tparam T the value type of the underlying Kokkos::View * @param[in] device the device for which this view should be allocated * @param[in] size the size of the Kokkos::View (number of elements **not** byte!) 
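Regarding the `use_usm_allocations` flag: the idea is that the same data can be placed either in the execution space's default memory space or in a host-and-device accessible ("USM") space. The sketch below shows the two allocation variants in plain Kokkos, assuming a Kokkos version that provides `Kokkos::SharedSpace` (4.0 or newer); the backend-specific USM space selection via `kokkos_execution_space_to_kokkos_memory_space_t` and the surrounding `device_view_wrapper` are omitted, and the helper names are hypothetical.

```cpp
// Hedged sketch: allocate a view either in the execution space's default
// memory space or in a host/device accessible space (Kokkos::SharedSpace is
// assumed to exist, i.e., Kokkos >= 4.0).
#include <Kokkos_Core.hpp>

#include <cstddef>

template <typename T, typename ExecutionSpace>
auto make_default_view(const ExecutionSpace &exec, const std::size_t size) {
    return Kokkos::View<T *, typename ExecutionSpace::memory_space>{ Kokkos::view_alloc(exec, "device_ptr_view"), size };
}

template <typename T, typename ExecutionSpace>
auto make_usm_view(const ExecutionSpace &exec, const std::size_t size) {
    return Kokkos::View<T *, Kokkos::SharedSpace>{ Kokkos::view_alloc(exec, "usm_device_ptr_view"), size };
}

int main(int argc, char *argv[]) {
    Kokkos::ScopeGuard guard{ argc, argv };
    Kokkos::DefaultExecutionSpace exec{};
    const auto regular = make_default_view<double>(exec, 1024);
    const auto usm = make_usm_view<double>(exec, 1024);
    return (regular.extent(0) == 1024 && usm.extent(0) == 1024) ? 0 : 1;
}
```

Since the two branches produce views of different types, a single factory like `make_device_view_wrapper` can only return both from one function because the wrapper's `std::variant` erases that type difference.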
- * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::ExecutionSpace specified by @p space (`[[nodiscard]]`) + * @param[in] use_usm_allocations decide whether a USM memory space should be used or not + * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::MemorySpace based on the requested Kokkos::ExecutionSpace and @p use_usm_allocations (`[[nodiscard]]`) */ template -[[nodiscard]] device_view_wrapper make_device_view_wrapper(const device_wrapper &device, const std::size_t size) { +[[nodiscard]] device_view_wrapper make_device_view_wrapper(const device_wrapper &device, const std::size_t size, const bool use_usm_allocations) { return device.execute_and_return([&](const auto &value) { + // get the Kokkos execution space using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; - - return device_view_wrapper{ Kokkos::View{ Kokkos::view_alloc(value, "device_ptr_view"), size } }; + // check whether we want to use USM allocations or not + if (use_usm_allocations) { + return device_view_wrapper{ Kokkos::View>{ Kokkos::view_alloc(value, "usm_device_ptr_view"), size }, use_usm_allocations }; + } else { + return device_view_wrapper{ Kokkos::View>{ Kokkos::view_alloc(value, "device_ptr_view"), size }, use_usm_allocations }; + } }); } diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp index bddadac01..1cff7f721 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -24,22 +26,29 @@ namespace plssvm::kokkos::detail { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam USMEnabledMemorySpace the Kokkos::MemorySpace that may use USM allocations + * @tparam target the target platform */ -template +template class device_kernel_symm { + /** + * @brief The type of the used Kokkos::View that may use USM allocations. + */ + template + using usm_device_view_type = Kokkos::View; // possible USM allocations /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. 
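`device_kernel_symm` operates on a symmetric matrix A of which only the padded upper triangle is stored row-wise; the index expressions of the form `r * (n + PADDING_SIZE) + c - r * (r + 1) / 2` in the kernel body implement exactly that packed layout. As a plain, single-threaded reference (padding, multi-device offsets, and blocking omitted; all names are illustrative, not the PLSSVM API), the layout and the SYMM it enables look roughly like this:

```cpp
// Reference for the packed symmetric SYMM: A is an n x n symmetric matrix of
// which only the upper triangle is stored row-wise, and C = alpha * A * B + beta * C
// is computed for a dense k-column B. Row padding is omitted for clarity.
#include <cstddef>
#include <vector>

using real_type = double;

// index of A(r, c) with r <= c in the row-wise packed upper triangle
std::size_t packed_upper_index(const std::size_t r, const std::size_t c, const std::size_t n) {
    return r * n + c - r * (r + 1) / 2;
}

// C (n x k, row-major) = alpha * A * B + beta * C with A packed upper triangular
void symm_packed_reference(const std::size_t n, const std::size_t k, const real_type alpha, const std::vector<real_type> &A_packed, const std::vector<real_type> &B, const real_type beta, std::vector<real_type> &C) {
    for (std::size_t row = 0; row < n; ++row) {
        for (std::size_t col = 0; col < k; ++col) {
            real_type sum{ 0.0 };
            for (std::size_t dim = 0; dim < n; ++dim) {
                // read A(row, dim) from the packed upper triangle, using symmetry for dim < row
                const real_type a = (dim < row) ? A_packed[packed_upper_index(dim, row, n)]
                                                : A_packed[packed_upper_index(row, dim, n)];
                sum += a * B[dim * k + col];
            }
            C[row * k + col] = alpha * sum + beta * C[row * k + col];
        }
    }
}
```

With n(n+1)/2 stored values instead of n², the same symmetry argument lets the kernel serve A(row, dim) for both sides of the diagonal from the upper triangle alone, which is what the two branches of the `A_cache` load below do.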
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -49,11 +58,11 @@ class device_kernel_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, usm_device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -69,79 +78,96 @@ class device_kernel_symm { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - 
const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type))); - Kokkos::mdspan> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> A_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> B_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for 
(unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = A_[(dim_block + threadIdx_y) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = B_[(dim_block + device_row_offset_ + threadIdx_y) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - // determine on which side of the diagonal we are located - if (dim + threadIdx_y + THREAD_BLOCK_SIZE < global_j) { - A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz + std::size_t{ 1 }) / std::size_t{ 2 }]; + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + temp[internal_i][internal_j] += sum; + } + } } else { - A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - - B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; - B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) 
+ global_i]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i); + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -151,10 +177,10 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; - device_view_type A_; + usm_device_view_type A_; device_view_type B_; const real_type beta_; device_view_type C_; @@ -168,14 +194,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
* @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam USMEnabledMemorySpace the Kokkos::MemorySpace that may use USM allocations + * @tparam target the target platform */ -template +template class device_kernel_symm_mirror { + /** + * @brief The type of the used Kokkos::View that may use USM allocations. + */ + template + using usm_device_view_type = Kokkos::View; // possible USM allocations /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** @@ -183,8 +216,8 @@ class device_kernel_symm_mirror { * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -194,12 +227,12 @@ class device_kernel_symm_mirror { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, usm_device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -215,69 +248,90 @@ class device_kernel_symm_mirror { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = 
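Both SYMM kernels (and their HPX counterparts earlier in this patch) now stage a `THREAD_BLOCK_SIZE`-wide slice of A and B in scratchpad memory and accumulate an `INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE` register tile per thread; the new `if constexpr (target == target_platform::cpu)` branch only changes the loop order of that tile update. A condensed, Kokkos-free sketch of the two orders follows; the tile sizes and names are placeholders, and the per-team cache layout is simplified to a single per-thread tile.

```cpp
// Register-blocked tile update after a DIM_BLOCK-wide slice has been cached:
// CPU order keeps the reduction dimension innermost, GPU order keeps it outermost.
#include <array>

using real_type = double;
inline constexpr unsigned TILE = 4;       // stand-in for INTERNAL_BLOCK_SIZE
inline constexpr unsigned DIM_BLOCK = 8;  // stand-in for THREAD_BLOCK_SIZE

using tile_t = std::array<std::array<real_type, TILE>, TILE>;
using cache_t = std::array<std::array<real_type, TILE>, DIM_BLOCK>;  // cache(dim, tile index)

// CPU-friendly order: dim is the fastest moving index
void update_tile_cpu(tile_t &temp, const cache_t &A_cache, const cache_t &B_cache) {
    for (unsigned i = 0; i < TILE; ++i) {
        for (unsigned j = 0; j < TILE; ++j) {
            real_type sum{ 0.0 };
            for (unsigned dim = 0; dim < DIM_BLOCK; ++dim) {
                sum += A_cache[dim][j] * B_cache[dim][i];
            }
            temp[i][j] += sum;
        }
    }
}

// GPU-friendly order: dim is the slowest moving index
void update_tile_gpu(tile_t &temp, const cache_t &A_cache, const cache_t &B_cache) {
    for (unsigned dim = 0; dim < DIM_BLOCK; ++dim) {
        for (unsigned i = 0; i < TILE; ++i) {
            for (unsigned j = 0; j < TILE; ++j) {
                temp[i][j] += A_cache[dim][j] * B_cache[dim][i];
            }
        }
    }
}
```

The patch itself only documents which index moves fastest; a plausible rationale is that the scalar `sum` accumulator suits CPU register allocation and auto-vectorization, whereas the dim-outermost order maximizes reuse of each cached entry across the whole tile on GPUs.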
static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type))); - Kokkos::mdspan> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two shared memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> A_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> B_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining 
features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y) + global_j]; - A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - std::size_t{ 1 }) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) + global_j]; - B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; - B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + A_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = A_[(dim_block + threadIdx_y) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = B_[(device_row_offset_ + dim_block + 
threadIdx_y) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -288,10 +342,10 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; - device_view_type A_; + usm_device_view_type A_; device_view_type B_; const real_type beta_; device_view_type C_; @@ -311,7 +365,7 @@ class device_kernel_inplace_matrix_add { * @brief The type of the used Kokkos::View. 
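// Sketch (not from this patch, purely illustrative): the `if constexpr (target == target_platform::cpu)`
// branches above pick the loop order per target. On the CPU target the reduction dimension is the
// innermost (fastest moving) loop, presumably so each temp[i][j] is accumulated in a register-resident
// scalar and the auto-vectorizer sees a plain reduction; on GPU-like targets the reduction dimension
// stays outermost, preserving the original scratchpad access pattern across the team. A minimal
// standalone illustration of the two orderings (tile layout and names here are assumptions, not PLSSVM code):
template <bool is_cpu, unsigned TB, unsigned IB>
void tile_dot_product(const double (&tile_a)[TB][IB * TB], const double (&tile_b)[TB][IB * TB],
                      double (&acc)[IB][IB], const unsigned tx, const unsigned ty) {
    if constexpr (is_cpu) {
        // reduction dimension innermost: one scalar accumulator per (i, j) pair
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                double sum = 0.0;
                for (unsigned dim = 0; dim < TB; ++dim) {
                    sum += tile_a[dim][ty * IB + j] * tile_b[dim][tx * IB + i];
                }
                acc[i][j] += sum;
            }
        }
    } else {
        // reduction dimension outermost: accumulate directly into the per-thread register array
        for (unsigned dim = 0; dim < TB; ++dim) {
            for (unsigned i = 0; i < IB; ++i) {
                for (unsigned j = 0; j < IB; ++j) {
                    acc[i][j] += tile_a[dim][ty * IB + j] * tile_b[dim][tx * IB + i];
                }
            }
        }
    }
}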
*/ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** @@ -338,26 +392,28 @@ class device_kernel_inplace_matrix_add { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // Calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // num_rhs + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -383,7 +439,7 @@ class 
device_kernel_inplace_matrix_scale { * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** @@ -410,26 +466,28 @@ class device_kernel_inplace_matrix_scale { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // Calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // num_rhs + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git 
a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp index 8e42e8b41..8daeb8a26 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -27,25 +29,32 @@ namespace plssvm::kokkos::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam USMEnabledMemorySpace the Kokkos::MemorySpace that may use USM allocations + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `standard_layout_tuple` */ -template +template class device_kernel_assembly { + /** + * @brief The type of the used Kokkos::View that may use USM allocations. + */ + template + using usm_device_view_type = Kokkos::View; // possible USM allocations /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -55,12 +64,12 @@ class device_kernel_assembly { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(device_view_type kernel_matrix_d, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(usm_device_view_type kernel_matrix, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -77,81 +86,103 @@ class device_kernel_assembly { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + 
grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + auto *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> data_i_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> data_j_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) 
{ + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + data_i_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const 
auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -160,11 +191,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - device_view_type kernel_matrix_d_; - device_view_type data_d_; + usm_device_view_type kernel_matrix_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; device_view_type q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index b22f69885..ad1c6536a 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add @@ -27,26 +29,27 @@ namespace plssvm::kokkos::detail { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. 
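// Sketch (not from this patch, purely illustrative): the explicit assembly kernel above stores
// `kernel_matrix_` in a row-major packed upper-triangular layout, i.e. device-local row j starts at
// j * (num_rows_ - device_row_offset_ + PADDING_SIZE) - j * (j + 1) / 2 and only the triangle (plus
// padding) is kept in memory. With the padding term left out, the offset formula can be checked in isolation:
#include <cassert>
#include <cstddef>

// flat index of element (row j, column i) with i >= j in a packed upper-triangular matrix with n columns
constexpr std::size_t packed_upper_index(const std::size_t j, const std::size_t i, const std::size_t n) {
    return j * n - j * (j + 1) / 2 + i;
}

int main() {
    constexpr std::size_t n = 5;
    std::size_t expected = 0;
    // the rows of the upper triangle are stored back-to-back, each one element shorter than the previous
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n; ++i) {
            assert(packed_upper_index(j, i, n) == expected);
            ++expected;
        }
    }
    return 0;
}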
* @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -58,13 +61,13 @@ class device_kernel_assembly_symm { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, device_view_type q, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type B, device_view_type C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, device_view_type q, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type B, device_view_type C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -82,59 +85,81 @@ class device_kernel_assembly_symm { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const 
auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + // get the scratchpad memory pointer for later usage + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + Kokkos::mdspan> data_i_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> data_j_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; + // 
calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + data_i_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } team.team_barrier(); // wait until all threads loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } } } } @@ -145,16 +170,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const 
auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -164,42 +191,44 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - Kokkos::mdspan> B_cache{ data_cache_ptr, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE }; - Kokkos::mdspan> C_out_cache{ data_cache_ptr + shmem_size, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + Kokkos::mdspan> B_cache{ scratchpad_ptr, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz, THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> C_out_cache{ scratchpad_ptr + scratchpad_size, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz, THREAD_BLOCK_SIZE_uz }; // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y]; - B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; - C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = real_type{ 0.0 }; - C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = real_type{ 0.0 }; + // store the values in the scratchpad memory + B_cache(internal * THREAD_BLOCK_SIZE + team_rank_x, team_rank_y) = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache(internal * THREAD_BLOCK_SIZE + team_rank_x, team_rank_y) = real_type{ 0.0 }; // SoA } team.team_barrier(); // wait until all threads loaded their part of the data - // calculate intermediate results and store 
them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in scratchpad memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE) += - temp[internal_i][internal_j] * B_cache(threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE); + C_out_cache(team_rank_y * INTERNAL_BLOCK_SIZE + internal_j, (class_idx + team_rank_x) % THREAD_BLOCK_SIZE) += + temp[internal_i][internal_j] * B_cache(team_rank_x * INTERNAL_BLOCK_SIZE + internal_i, (class_idx + team_rank_x) % THREAD_BLOCK_SIZE); } } team.team_barrier(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x)); - Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_sz], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x + THREAD_BLOCK_SIZE)); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + Kokkos::atomic_add(&C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache(team_rank_y * INTERNAL_BLOCK_SIZE + internal, team_rank_x)); // SoA } team.team_barrier(); // wait until all threads updated C with their values } @@ -208,51 +237,54 @@ class device_kernel_assembly_symm { // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - Kokkos::mdspan> B_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> C_out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + Kokkos::mdspan> B_cache{ 
scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> C_out_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y]; - B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; - C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; - C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + // store the values in the scratchpad memory + B_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = real_type{ 0.0 }; } team.team_barrier(); // wait until all threads loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in scratchpad memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_i * THREAD_BLOCK_SIZE + threadIdx_x) += - temp[internal_i][internal_j] * B_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j); + C_out_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, internal_i * THREAD_BLOCK_SIZE + team_rank_x) += + temp[internal_i][internal_j] * B_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j); } } team.team_barrier(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + static_cast(internal); - Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x)); - Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz], 
C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x)); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + Kokkos::atomic_add(&C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x)); // SoA } team.team_barrier(); // wait until all threads updated C with their values } @@ -264,10 +296,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; device_view_type q_; - device_view_type data_d_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp index a2859a294..652aaa25c 100644 --- a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp @@ -45,42 +45,17 @@ template /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] KOKKOS_INLINE_FUNCTION real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp index 767bfc958..51e67a89e 100644 --- a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp @@ -13,9 +13,11 @@ #define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_ #pragma once -#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // 
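// Sketch (not from this patch, purely illustrative): the powi change above drops the hard-coded cases
// for degree <= 6 in favor of a plain loop with `exponent` multiplications. Should larger polynomial
// degrees ever become relevant, exponentiation by squaring keeps the cost at O(log(exponent)):
[[nodiscard]] constexpr double powi_by_squaring(double base, int exponent) {
    // assumes exponent >= 0, matching the "only positive integer degree" precondition stated above
    double result = 1.0;
    while (exponent > 0) {
        if ((exponent & 1) == 1) {
            result *= base;  // fold in the contribution of the current bit
        }
        base *= base;  // square the base for the next bit
        exponent >>= 1;
    }
    return result;
}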
plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add @@ -24,39 +26,40 @@ namespace plssvm::kokkos::detail { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform */ -template +template class device_kernel_w_linear { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_w_linear(device_view_type w_d, device_view_type alpha_d, device_view_type sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(device_view_type w, device_view_type alpha, device_view_type support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, grid_size_x_{ grid_size_x } { } @@ -67,75 +70,100 @@ class device_kernel_w_linear { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_sz = 
static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_feature{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_alpha{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> feature_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> alpha_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * 
THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_sz) + sv + threadIdx_y]; // SoA - data_cache_alpha(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_sz) + sv + sv_offset_ + threadIdx_y]; // AoS - } - team.team_barrier(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_feature(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_feature); + // store the values in the scratchpad memory + feature_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_y]; // AoS + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache(sv, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * 
feature_cache(sv, team_rank_x * INTERNAL_BLOCK_SIZE + internal_feature); + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache(sv, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * feature_cache(sv, team_rank_x * INTERNAL_BLOCK_SIZE + internal_feature); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// @cond Doxygen_suppress - device_view_type w_d_; - device_view_type alpha_d_; - device_view_type sv_d_; + device_view_type w_; + device_view_type alpha_; + device_view_type support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; const std::size_t grid_size_x_; @@ -143,24 +171,25 @@ class device_kernel_w_linear { }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform */ -template +template class device_kernel_predict_linear { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. 
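The w-vector kernel above, like every compute kernel touched in the remainder of this diff, switches its dot-product loop order depending on target_platform::cpu. As a rough standalone illustration of that pattern (not PLSSVM code; the tile sizes TB and IB and the cache arrays merely stand in for THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, and the scratchpad mdspans), the CPU variant keeps the reduction index innermost and accumulates into a scalar the auto-vectorizer can hold in a register, while the GPU variant keeps the reduction index outermost so the cached tiles are swept row by row between barriers:

constexpr unsigned TB = 8;  // stands in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // stands in for INTERNAL_BLOCK_SIZE

// CPU ordering: the reduction index k is the fastest moving index; the local sum keeps the
// accumulation in a register and leaves a contiguous inner loop for the auto-vectorizer
void accumulate_cpu(const double a[TB][IB * TB], const double b[TB][IB * TB],
                    double temp[IB][IB], const unsigned tx, const unsigned ty) {
    for (unsigned i = 0; i < IB; ++i) {
        for (unsigned j = 0; j < IB; ++j) {
            double sum = 0.0;
            for (unsigned k = 0; k < TB; ++k) {
                sum += a[k][ty * IB + j] * b[k][tx * IB + i];
            }
            temp[i][j] += sum;
        }
    }
}

// GPU ordering: the reduction index k is the slowest moving index, so the threads of a team
// touch the cached tiles row by row between barriers
void accumulate_gpu(const double a[TB][IB * TB], const double b[TB][IB * TB],
                    double temp[IB][IB], const unsigned tx, const unsigned ty) {
    for (unsigned k = 0; k < TB; ++k) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                temp[i][j] += a[k][ty * IB + j] * b[k][tx * IB + i];
            }
        }
    }
}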
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point @@ -168,11 +197,11 @@ class device_kernel_predict_linear { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_predict_linear(device_view_type prediction_d, device_view_type w_d, device_view_type rho_d, device_view_type predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(device_view_type prediction, device_view_type w, device_view_type rho, device_view_type predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -186,76 +215,97 @@ class device_kernel_predict_linear { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; - const auto pp_idx_linear = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_w{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> pp_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> w_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_w(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y) * (num_classes_ + PADDING_SIZE_sz) + 
global_class_idx]; - data_cache_w(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_classes_ + PADDING_SIZE_sz) + global_class_idx]; - } - team.team_barrier(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + pp_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = predict_points_[(feature_block + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = w_[(feature_block + threadIdx_y) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp); + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the 
calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - device_view_type prediction_d_; - device_view_type w_d_; - device_view_type rho_d_; - device_view_type predict_points_d_; + device_view_type prediction_; + device_view_type w_; + device_view_type rho_; + device_view_type predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -266,27 +316,28 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_predict { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the SYCL kernel function object. 
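Before the generic device_kernel_predict below, a plain, unblocked reference loop may help to see what device_kernel_predict_linear computes per output entry, namely prediction[p][c] = dot(w[:, c], predict_points[p, :]) - rho[c]. This is only a sketch: it ignores the padding and the SoA layouts of the real kernel, and the flat row-major indexing chosen here is an assumption made for the example.

#include <cstddef>
#include <vector>

void predict_linear_reference(std::vector<double> &prediction,            // num_predict_points x num_classes
                              const std::vector<double> &w,               // num_features x num_classes
                              const std::vector<double> &rho,             // num_classes
                              const std::vector<double> &predict_points,  // num_predict_points x num_features
                              const std::size_t num_classes,
                              const std::size_t num_predict_points,
                              const std::size_t num_features) {
    for (std::size_t p = 0; p < num_predict_points; ++p) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            double sum = 0.0;
            for (std::size_t f = 0; f < num_features; ++f) {
                sum += w[f * num_classes + c] * predict_points[p * num_features + f];
            }
            prediction[p * num_classes + c] = sum - rho[c];  // subtract the bias once per class
        }
    }
}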
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -296,12 +347,12 @@ class device_kernel_predict { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(device_view_type prediction_d, device_view_type alpha_d, device_view_type rho_d, device_view_type sv_d, device_view_type predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(device_view_type prediction, device_view_type alpha, device_view_type rho, device_view_type support_vectors, device_view_type predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, @@ -317,55 +368,72 @@ class device_kernel_predict { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + // get the scratchpad memory pointer for later usage + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); // create 
a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - Kokkos::mdspan> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_sv{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + Kokkos::mdspan> pp_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> sv_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_sv(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; - data_cache_sv(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + pp_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = predict_points_[(feature_block + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = support_vectors_[(feature_block + threadIdx_y) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } team.team_barrier(); // wait until all threads loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; 
++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv), - data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd)); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_sv), + pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp)); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_sv), + pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp)); + } } } } @@ -374,55 +442,57 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { - // create the shared memory arrays used for caching data point features - Kokkos::mdspan> alpha_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + // reinterpret the scratchpad memory to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + Kokkos::mdspan> alpha_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> out_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; 
internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; - alpha_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the scratchpad memory + alpha_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = alpha_[(class_block + threadIdx_y) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == std::size_t{ 0 }) { - out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y]; - out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; + out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = -rho_[class_block + threadIdx_y]; } else { - out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; - out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = real_type{ 0.0 }; } } team.team_barrier(); // wait until all threads loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate intermediate results and store them in scratchpad memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_pd * THREAD_BLOCK_SIZE + threadIdx_x) += - temp[internal_pd][internal_sv] * alpha_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv); + out_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, internal_pp * THREAD_BLOCK_SIZE + team_rank_x) += + temp[internal_pp][internal_sv] * alpha_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, team_rank_y * INTERNAL_BLOCK_SIZE + internal_sv); } } team.team_barrier(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x)); - Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y 
+ THREAD_BLOCK_SIZE_sz], out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x)); + Kokkos::atomic_add(&prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x)); } team.team_barrier(); // wait until all threads updated their part of the prediction } @@ -431,11 +501,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - device_view_type prediction_d_; - device_view_type alpha_d_; - device_view_type rho_d_; - device_view_type sv_d_; - device_view_type predict_points_d_; + device_view_type prediction_; + device_view_type alpha_; + device_view_type rho_; + device_view_type support_vectors_; + device_view_type predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/Kokkos/memory_space.hpp b/include/plssvm/backends/Kokkos/memory_space.hpp new file mode 100644 index 000000000..eba6e1674 --- /dev/null +++ b/include/plssvm/backends/Kokkos/memory_space.hpp @@ -0,0 +1,77 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Memory space enumeration for the MemorySpaces in Kokkos. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_HPP_ +#pragma once + +#include "fmt/base.h" // fmt::formatter +#include "fmt/ostream.h" // fmt::ostream_formatter + +#include // std::ostream forward declaration +#include // std::vector + +namespace plssvm::kokkos { + +/** + * @brief Enum class for all memory spaces supported by [Kokkos](https://github.com/kokkos/kokkos). + */ +enum class memory_space { + /** Memory space representing traditional memory accessible from the CPU. */ + host_space, + /** Memory space representing memory on a CUDA-capable GPU. */ + cuda_space, + /** Memory space representing unified virtual memory on a CUDA-capable GPU system. */ + cuda_usm_space, + /** Memory space representing memory in the HIP GPU programming environment. */ + hip_space, + /** Memory space representing page-migrating memory in the HIP GPU programming environment. */ + hip_usm_space, + /** Memory space representing device memory in the SYCL GPU programming environment. */ + sycl_space, + /** Memory space representing page-migrating memory in the SYCL GPU programming environment */ + sycl_usm_space +}; + +/** + * @brief Output the memory @p space to the given output-stream @p out. + * @param[in,out] out the output-stream to write the memory space to + * @param[in] space the Kokkos memory space + * @return the output-stream + */ +std::ostream &operator<<(std::ostream &out, memory_space space); + +/** + * @brief Use the input-stream @p in to initialize the memory @p space. + * @param[in,out] in input-stream to extract the memory space from + * @param[in] space the Kokkos memory space + * @return the input-stream + */ +std::istream &operator>>(std::istream &in, memory_space &space); + +/** + * @brief List all available Kokkos::MemorySpaces. + * @details Only Kokkos::MemorySpaces that where enabled during the CMake configuration are available. + * The `memory_space::host_space` is always included. 
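A hypothetical usage sketch for the new memory_space enum (it relies only on the stream operators declared above and the fmt::formatter specialization provided at the end of this header; the enumerator spelling accepted by operator>> is an assumption):

#include "plssvm/backends/Kokkos/memory_space.hpp"

#include "fmt/format.h"

#include <sstream>

int main() {
    plssvm::kokkos::memory_space space{};
    std::istringstream{ "cuda_usm_space" } >> space;   // parse via operator>> (accepted spelling is assumed)
    fmt::print("selected memory space: {}\n", space);  // formatted via the fmt::ostream_formatter specialization
    return 0;
}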
+ * @return the available Kokkos::MemorySpaces (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector list_available_memory_spaces(); + +} // namespace plssvm::kokkos + +/// @cond + +template <> +struct fmt::formatter : fmt::ostream_formatter { }; + +/// @endcond + +#endif // PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_HPP_ diff --git a/include/plssvm/backends/Kokkos/memory_space_type_traits.hpp b/include/plssvm/backends/Kokkos/memory_space_type_traits.hpp new file mode 100644 index 000000000..a25a846ab --- /dev/null +++ b/include/plssvm/backends/Kokkos/memory_space_type_traits.hpp @@ -0,0 +1,265 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Memory space type traits for the MemorySpaces in Kokkos. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_TYPE_TRAITS_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_TYPE_TRAITS_HPP_ +#include +#pragma once + +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp" // plssvm::kokkos::kokkos_type_to_execution_space_v +#include "plssvm/backends/Kokkos/memory_space.hpp" // plssvm::kokkos::memory_space + +#include "Kokkos_Core.hpp" // Kokkos macros, Kokkos MemorySpace types + +namespace plssvm::kokkos { + +//***************************************************// +// memory_space_to_kokkos_type // +//***************************************************// + +/** + * @brief Uninstantiated base type to convert a `memory_space` enum value to a Kokkos::MemorySpace type. + */ +template +struct memory_space_to_kokkos_type; + +/** + * @brief Convert a `memory_space::host_space` enum value to a `Kokkos::HostSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::HostSpace; +}; + +#if defined(KOKKOS_ENABLE_CUDA) +/** + * @brief Convert a `memory_space::cuda_space` enum value to a `Kokkos::CudaSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::CudaSpace; +}; + +/** + * @brief Convert a `memory_space::cuda_usm_space` enum value to a `Kokkos::CudaUVMSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +/** + * @brief Convert a `memory_space::hip_space` enum value to a `Kokkos::HIPSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::HIPSpace; +}; + +/** + * @brief Convert a `memory_space::hip_usm_space` enum value to a `Kokkos::HIPManagedSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::HIPManagedSpace; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) +/** + * @brief Convert a `memory_space::sycl_space` enum value to a `Kokkos::SYCLDeviceUSMSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::SYCLDeviceUSMSpace; +}; + +/** + * @brief Convert a `memory_space::sycl_usm_space` enum value to a `Kokkos::SYCLSharedUSMSpace` Kokkos::MemorySpace type. 
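A minimal illustration of the enum-to-type direction declared above (not part of the patch; it assumes the enum value is the non-type template parameter of memory_space_to_kokkos_type, and the CUDA check is guarded because that specialization only exists when Kokkos was built with CUDA support):

#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp"

#include "Kokkos_Core.hpp"

#include <type_traits>

// the host mapping is always available
static_assert(std::is_same_v<plssvm::kokkos::memory_space_to_kokkos_type<plssvm::kokkos::memory_space::host_space>::type,
                             Kokkos::HostSpace>);

#if defined(KOKKOS_ENABLE_CUDA)
// the CUDA USM mapping selects the unified virtual memory space
static_assert(std::is_same_v<plssvm::kokkos::memory_space_to_kokkos_type<plssvm::kokkos::memory_space::cuda_usm_space>::type,
                             Kokkos::CudaUVMSpace>);
#endif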
+ */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::SYCLSharedUSMSpace; +}; +#endif + +/** + * @brief Convert the `memory_space` @p space to the corresponding Kokkos::MemorySpace type. + * @tparam space the enum value to convert + */ +template +using memory_space_to_kokkos_type_t = typename memory_space_to_kokkos_type::type; + +//***************************************************// +// kokkos_type_to_memory_space // +//***************************************************// + +/** + * @brief Uninstantiated base type to convert a Kokkos::MemorySpace type to a `memory_space` enum value. + */ +template +struct kokkos_type_to_memory_space; + +/** + * @brief Convert a `Kokkos::HostSpace` Kokkos::MemorySpace type to a `memory_space::host_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::host_space; +}; + +#if defined(KOKKOS_ENABLE_CUDA) +/** + * @brief Convert a `Kokkos::CudaSpace` Kokkos::MemorySpace type to a `memory_space::cuda_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::cuda_space; +}; + +/** + * @brief Convert a `Kokkos::CudaUVMSpace` Kokkos::MemorySpace type to a `memory_space::cuda_usm_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::cuda_usm_space; +}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +/** + * @brief Convert a `Kokkos::HIPSpace` Kokkos::MemorySpace type to a `memory_space::hip_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::hip_space; +}; + +/** + * @brief Convert a `Kokkos::HIPManagedSpace` Kokkos::MemorySpace type to a `memory_space::hip_usm_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::hip_usm_space; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) +/** + * @brief Convert a `Kokkos::SYCLDeviceUSMSpace` Kokkos::MemorySpace type to a `memory_space::sycl_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::sycl_space; +}; + +/** + * @brief Convert a `Kokkos::SYCLSharedUSMSpace` Kokkos::MemorySpace type to a `memory_space::sycl_usm_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::sycl_usm_space; +}; +#endif + +/** + * @brief Convert the Kokkos::MemorySpace type @p MemorySpace to the corresponding `memory_space` enum value. + * @tparam MemorySpace the Kokkos::MemorySpace type to convert + */ +template +inline constexpr memory_space kokkos_type_to_memory_space_v = kokkos_type_to_memory_space::value; + +//***************************************************// +// execution_space_to_memory_space // +//***************************************************// + +/** + * @brief Convert a host `execution_space` enum value to a `memory_space::host_space` enum value. + */ +template +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::host_space; +}; + +/** + * @brief Convert an `execution_space::cuda` that does not use USM allocations enum value to a `memory_space::cuda_space` enum value. 
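The reverse direction, kokkos_type_to_memory_space_v, maps a concrete Kokkos::MemorySpace type back to the enum, e.g. for reporting where a View lives. A minimal sketch under the same assumptions as the previous example:

#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp"

#include "Kokkos_Core.hpp"

// round trip: Kokkos::HostSpace maps back to memory_space::host_space
static_assert(plssvm::kokkos::kokkos_type_to_memory_space_v<Kokkos::HostSpace>
              == plssvm::kokkos::memory_space::host_space);

#if defined(KOKKOS_ENABLE_HIP)
// the page-migrating HIP space maps to the corresponding USM enum value
static_assert(plssvm::kokkos::kokkos_type_to_memory_space_v<Kokkos::HIPManagedSpace>
              == plssvm::kokkos::memory_space::hip_usm_space);
#endif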
+ */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::cuda_space; +}; + +/** + * @brief Convert an `execution_space::cuda` that does use USM allocations enum value to a `memory_space::cuda_usm_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::cuda_usm_space; +}; + +/** + * @brief Convert an `execution_space::hip` that does not use USM allocations enum value to a `memory_space::hip_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::hip_space; +}; + +/** + * @brief Convert an `execution_space::hip` that does use USM allocations enum value to a `memory_space::hip_usm_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::hip_usm_space; +}; + +/** + * @brief Convert an `execution_space::sycl` that does not use USM allocations enum value to a `memory_space::sycl_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::sycl_space; +}; + +/** + * @brief Convert an `execution_space::sycl` that does use USM allocations enum value to a `memory_space::sycl_usm_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::sycl_usm_space; +}; + +/** + * @brief Convert the `execution_space` enum value @p space together with the @p UseUSM flag indication whether USM allocation should be used to the corresponding `memory_space` enum value. + * @tparam space the `execution_space` enum value to convert + * @tparam UseUSM `true` if USM allocations should be used + */ +template +inline constexpr memory_space execution_space_to_memory_space_v = execution_space_to_memory_space::value; + +//***************************************************// +// kokkos_execution_space_to_kokkos_memory_space // +//***************************************************// + +/** + * @brief Convert the Kokkos::ExecutionSpace type together with the @p UseUSM flag indication whether USM allocation should be used to the corresponding Kokkos::MemorySpace type. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace type + * @tparam UseUSM `true` if USM allocations should be used + */ +template +using kokkos_execution_space_to_kokkos_memory_space_t = memory_space_to_kokkos_type_t, UseUSM>>; + +} // namespace plssvm::kokkos + +#endif // PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/backends/OpenCL/csvm.hpp b/include/plssvm/backends/OpenCL/csvm.hpp index f52ec29cd..1ac955019 100644 --- a/include/plssvm/backends/OpenCL/csvm.hpp +++ b/include/plssvm/backends/OpenCL/csvm.hpp @@ -112,7 +112,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t +#include // std::variant namespace plssvm::opencl::detail { @@ -28,13 +29,14 @@ namespace plssvm::opencl::detail { * @tparam T the type of the kernel pointer to wrap */ template -class device_ptr : public ::plssvm::detail::gpu_device_ptr> { +class device_ptr : public ::plssvm::detail::gpu_device_ptr, device_ptr> { /// The template base type of the OpenCL device_ptr class. 
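Tying the Kokkos traits from the header above together: the kernels earlier in this diff mark their Views with a "no USM allocations" comment, which corresponds to instantiating kokkos_execution_space_to_kokkos_memory_space_t with UseUSM = false. A minimal sketch of such a view alias (the exact View template arguments used in PLSSVM are an assumption here):

#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp"

#include "Kokkos_Core.hpp"

#include <type_traits>

// pick the plain (non-USM) device memory space that matches the given ExecutionSpace
template <typename ExecutionSpace, typename T>
using device_view_type = Kokkos::View<T *, plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t<ExecutionSpace, false>>;

#if defined(KOKKOS_ENABLE_CUDA)
// with the CUDA execution space and UseUSM = false this selects Kokkos::CudaSpace
static_assert(std::is_same_v<device_view_type<Kokkos::Cuda, double>::memory_space, Kokkos::CudaSpace>);
#endif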
- using base_type = ::plssvm::detail::gpu_device_ptr>; + using base_type = ::plssvm::detail::gpu_device_ptr, device_ptr>; using base_type::data_; using base_type::queue_; using base_type::shape_; + using base_type::use_usm_allocations_; public: // Be able to use overloaded base class functions. @@ -60,21 +62,24 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr // std::string #include // std::string_view #include // std::forward, std::pair +#include // std::variant, std::visit #include // std::vector /** @@ -126,11 +129,12 @@ void device_synchronize(const command_queue &queue); * * @param[in] comm the MPI communicator * @param[in] contexts the used OpenCL contexts + * @param[in] target the target platform to create the kernel binaries for * @param[in] kernel_function the kernel function * @throws plssvm::invalid_file_format_exception if the file couldn't be read using [`std::ifstream::read`](https://en.cppreference.com/w/cpp/io/basic_istream/read) * @return [the command queues with all necessary kernels; information regarding the JIT compilation] (`[[nodiscard]]`) */ -[[nodiscard]] std::pair, jit_info> create_command_queues(const mpi::communicator &comm, const std::vector &contexts, kernel_function_type kernel_function); +[[nodiscard]] std::pair, jit_info> create_command_queues(const mpi::communicator &comm, const std::vector &contexts, target_platform target, kernel_function_type kernel_function); /** * @brief Set all arguments in the parameter pack @p args for the kernel @p kernel. @@ -143,7 +147,17 @@ inline void set_kernel_args(cl_kernel kernel, Args... args) { cl_uint i = 0; // iterate over parameter pack and set OpenCL kernel ([&](auto &arg) { - const error_code ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + error_code ec{}; + // check if we have to set a variant value + if constexpr (::plssvm::detail::is_variant_v<::plssvm::detail::remove_cvref_t>) { + std::visit(::plssvm::detail::visit_overload{ + [&](cl_mem &kernel_arg) { ec = clSetKernelArg(kernel, i++, sizeof(decltype(kernel_arg)), &kernel_arg); }, + [&](auto &kernel_arg) { ec = clSetKernelArgSVMPointer(kernel, i++, kernel_arg); } }, + arg); + } else { + // set kernel argument normally + ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + } PLSSVM_OPENCL_ERROR_CHECK(ec, fmt::format("error setting OpenCL kernel argument {}", i - 1)) }(args), ...); diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl index 0f1ac247a..8b474df77 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl @@ -16,8 +16,8 @@ * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
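The std::visit dispatch in set_kernel_args above distinguishes classic buffer arguments from SVM pointers. A standalone sketch of the same idea (not PLSSVM code; the kernel_arg variant and the function name are made up for the example, and clSetKernelArgSVMPointer requires OpenCL 2.0 or newer):

#define CL_TARGET_OPENCL_VERSION 200

#include <CL/cl.h>

#include <type_traits>
#include <variant>

// a kernel argument is either a regular buffer handle or a shared virtual memory pointer
using kernel_arg = std::variant<cl_mem, void *>;

cl_int set_single_kernel_arg(cl_kernel kernel, const cl_uint index, const kernel_arg &arg) {
    return std::visit([&](auto value) -> cl_int {
        if constexpr (std::is_same_v<decltype(value), cl_mem>) {
            // regular buffer object -> pass the cl_mem handle by address
            return clSetKernelArg(kernel, index, sizeof(cl_mem), &value);
        } else {
            // SVM allocation -> pass the raw pointer directly
            return clSetKernelArgSVMPointer(kernel, index, value);
        }
    }, arg);
}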
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -26,78 +26,90 @@ * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, const ulong device_specific_num_rows, const ulong row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, const ulong device_num_rows, const ulong device_row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // #rhs - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # row - const ulong j_linear = blockIdx_y * blockDim_y * 
INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + __local real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // create a thread private array used for internal caching + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // determine on which side of the diagonal we are located - if (dim + get_local_id(1) < global_j) { - A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ul) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + (ulong) 1) / (ulong) 2]; - } else { - A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ul) + dim + threadIdx_y - global_j * (global_j + (ulong) 1) / (ulong) 2]; - } - // determine on which side of the diagonal we are located - if (dim + get_local_id(1) + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows - row_offset + PADDING_SIZE_ul) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ul + (ulong) 1) / (ulong) 2]; - } else { - A_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul - global_j * (global_j + (ulong) 1) / (ulong) 2]; - } + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const ulong j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows - B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ul) + global_i]; - B_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rhs + PADDING_SIZE_ul) + global_i]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (ulong dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong 
global_i_idx_linear = i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + get_local_id(1) < global_j_idx_linear) { + A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + (ulong) 1) / (ulong) 2]; // SoA, upper triangular matrix only + } else { + A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + (ulong) 1) / (ulong) 2]; // SoA, upper triangular matrix only + } + B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the dot product calculation, the dim is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum = 0.0; + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } +#else + // perform the dot product calculation, the dim is the slowest moving index + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + // calculate the indices to access the global data and the data with respect to the current device + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong device_global_j_idx = j_idx + (ulong) internal_j; + const ulong 
global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i]; + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +121,8 @@ __kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, cons * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -119,68 +131,85 @@ __kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, cons * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_symm_mirror(const ulong num_rows, const ulong num_rhs, const ulong num_mirror_rows, const ulong device_specific_num_rows, const ulong row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_symm_mirror(const ulong num_rows, const ulong num_rhs, const ulong num_mirror_rows, const ulong device_num_rows, const ulong device_row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y 
= get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // #rhs - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # row - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + __local real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // create a thread private array used for internal caching + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ul) - (dim + threadIdx_y - (ulong) 1) * (dim + threadIdx_y) / (ulong) 2 + device_specific_num_rows - (dim + get_local_id(1)) + global_j]; - A_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows - row_offset + PADDING_SIZE_ul) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ul - (ulong) 1) * (dim + get_local_id(1) + THREAD_BLOCK_SIZE_ul) / (ulong) 2 + device_specific_num_rows - (dim + get_local_id(1) + THREAD_BLOCK_SIZE_ul) + global_j]; - B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ul) + global_i]; - B_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rhs + PADDING_SIZE_ul) + global_i]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const ulong j_idx_linear = blockIdx_y * 
blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (ulong dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_i_idx_linear = i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - (ulong) 1) * (dim_block + threadIdx_y) / (ulong) 2 + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the dot product calculation, the dim is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum = 0.0; + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; } } +#else + // perform the dot product calculation, the dim is the slowest moving index + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong partial_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + device_specific_num_rows + j + (ulong) internal_j; + // calculate the indices to access the global data and the data with respect to 
the current device + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong partial_global_j_idx = j_idx + (ulong) internal_j; + const ulong global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i]; + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -196,23 +225,24 @@ __kernel void device_kernel_symm_mirror(const ulong num_rows, const ulong num_rh */ __kernel void device_kernel_inplace_matrix_add(const ulong num_cols, real_type __global *lhs, const real_type __global *rhs, const ulong grid_x_offset, const ulong grid_y_offset) { // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // # num_rows - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # num_rhs + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong global_j = j + (ulong) internal_j; + // calculate the indices to access the global data + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong global_j_idx = j_idx + (ulong) internal_j; - lhs[global_i * (num_cols + PADDING_SIZE_ul) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ul) + global_j]; + 
lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -227,23 +257,24 @@ __kernel void device_kernel_inplace_matrix_add(const ulong num_cols, real_type _ */ __kernel void device_kernel_inplace_matrix_scale(const ulong num_cols, real_type __global *lhs, const real_type scale, const ulong grid_x_offset, const ulong grid_y_offset) { // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // # num_rows - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # num_rhs + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong global_j = j + (ulong) internal_j; + // calculate the indices to access the global data + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong global_j_idx = j_idx + (ulong) internal_j; - lhs[global_i * (num_cols + PADDING_SIZE_ul) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl index 481945ca6..34f6afb48 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl @@ -14,11 +14,11 @@ /** * @brief Create the explicit kernel matrix using the kernel function determined at runtime. 
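A note on the indexing that keeps reappearing in blas.cl and in the assembly kernels below: `A` and `kernel_matrix` are stored as a row-major packed upper-triangular matrix with padded rows, so element (j, i) with i >= j lives at `j * (n + PADDING_SIZE) + i - j * (j + 1) / 2`, where n corresponds to `num_rows - device_row_offset`. The following host-side C++ sketch only illustrates that index arithmetic; the function name and test values are hypothetical and not part of PLSSVM.

```cpp
#include <cassert>
#include <cstddef>

// row-major packed upper-triangular indexing with padded rows: row j stores the
// columns j..n-1 followed by `padding` extra entries, so element (j, i) with
// i >= j ends up at offset j * (n + padding) + i - j * (j + 1) / 2
std::size_t packed_upper_index(const std::size_t j, const std::size_t i, const std::size_t n, const std::size_t padding) {
    assert(j <= i && i < n && "only the upper triangular part is stored");
    return j * (n + padding) + i - j * (j + 1) / 2;
}

int main() {
    constexpr std::size_t n = 4;
    constexpr std::size_t padding = 2;
    // row 1 starts right after the n + padding entries of row 0
    assert(packed_upper_index(1, 1, n, padding) == n + padding);
    // consecutive columns within a row are stored contiguously
    assert(packed_upper_index(2, 3, n, padding) == packed_upper_index(2, 2, n, padding) + 1);
    return 0;
}
```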
* @details The `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -27,78 +27,96 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... only added for Doxygen) */ -__kernel void device_kernel_assembly(__global real_type *kernel_matrix_d, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly(__global real_type *kernel_matrix, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets 
if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + __local real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_x >= blockIdx_y) { - // create a thread private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const ulong j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory 
accesses + const ulong global_i_idx_linear = device_row_offset + i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = device_row_offset + j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong device_global_i = i + (ulong) internal_i; - const ulong global_i = row_offset + i + (ulong) internal_i; - const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + // calculate the indices to access the global data and the data with respect to the current device + const ulong device_global_i_idx = i_idx + (ulong) internal_i; + const ulong global_i_idx = device_row_offset + device_global_i_idx; + const ulong device_global_j_idx = j_idx + (ulong) internal_j; + const ulong global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform 
out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; + // apply the final kernel function + temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + (ulong) 1) / (ulong) 2 + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl b/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl index cbcbea498..aecb2ab8b 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl @@ -19,10 +19,10 @@ * @note The beta factor is already applied to C before this kernel starts! * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -33,172 +33,203 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... 
only added for Doxygen) */ -__kernel void device_kernel_assembly_symm(const real_type alpha, const __global real_type *q, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const real_type QA_cost, const real_type cost, const __global real_type *B, __global real_type *C, const ulong num_classes, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly_symm(const real_type alpha, const __global real_type *q, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const real_type QA_cost, const real_type cost, const __global real_type *B, __global real_type *C, const ulong num_classes, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * 
INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const ulong j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two local memory arrays used for caching + __local real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // only calculate the upper triangular matrix -> can't use threadIdx since all work-items in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*data_i_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*data_j_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_i_idx_linear = device_row_offset + i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = device_row_offset + j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; - // perform 
the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = row_offset + i + (ulong) internal_i; - const ulong device_global_i = i + (ulong) internal_i; - const ulong global_j = row_offset + j + (ulong) internal_j; - const ulong device_global_j = j + (ulong) internal_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_i][internal_j] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const ulong device_global_i_idx = i_idx + (ulong) internal_i; + const ulong global_i_idx = device_row_offset + device_global_i_idx; + const ulong device_global_j_idx = j_idx + (ulong) internal_j; + const ulong global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if 
(device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_i][internal_j] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { + // be sure to set the value to zero otherwise temp[internal_i][internal_j] = (real_type) 0.0; } } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // reinterpret cache arrays with interchanged dimensions - __local real_type (*B_cache)[FEATURE_BLOCK_SIZE] = (__local real_type (*)[FEATURE_BLOCK_SIZE]) data_cache_i; - __local real_type (*C_out_cache)[FEATURE_BLOCK_SIZE] = (__local real_type (*)[FEATURE_BLOCK_SIZE]) data_cache_j; + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + __local real_type(*B_cache)[THREAD_BLOCK_SIZE] = (__local real_type(*)[THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*C_out_cache)[THREAD_BLOCK_SIZE] = (__local real_type(*)[THREAD_BLOCK_SIZE]) cache_two; // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ul) { + for (ulong class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_i_idx_linear = device_row_offset + i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = alpha * B[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1 + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = (real_type) 0.0; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1 + THREAD_BLOCK_SIZE] = (real_type) 0.0; + // store the values in the local memory + B_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = (real_type) 0.0; // SoA } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (uint class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (uint class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; 
++internal_j) { - C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_0) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_0) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_0) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_0) % THREAD_BLOCK_SIZE]; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_j = row_offset + j + (ulong) internal; - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_x], C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal][local_id_0]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ul], C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal][local_id_0 + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const ulong global_j_idx = device_row_offset + j_idx + (ulong) internal; + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal][local_id_0]); // SoA } - barrier(CLK_LOCAL_MEM_FENCE); // wai until all threads updated C with their values + barrier(CLK_LOCAL_MEM_FENCE); // wai until all work-items updated C with their values } } // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = row_offset + i + (ulong) internal_i; - const ulong global_j = row_offset + j + (ulong) internal_j; + // calculate the indices to access the global data + const ulong global_i_idx = device_row_offset + i_idx + (ulong) internal_i; + const ulong global_j_idx = device_row_offset + j_idx + (ulong) internal_j; - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = (real_type) 0.0; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // reinterpret cache arrays with interchanged dimensions - __local real_type (*B_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type (*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_i; - __local real_type (*C_out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type (*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_j; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*B_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*C_out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; // iterate over all classes using blocking to be able to cache them for faster 
memory accesses - for (ulong dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ul) { + for (ulong class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_j_idx_linear = device_row_offset + j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha * B[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y]; - B_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha * B[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul]; + // store the values in the local memory + B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; - C_out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory - for (uint class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (uint class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_0] += - temp[internal_i][internal_j] * B_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_0] += + temp[internal_i][internal_j] * B_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i + (ulong) internal; - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y], C_out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul], C_out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0]); + // calculate the indices to access the global data + const ulong global_i_idx = device_row_offset + i_idx + (ulong) internal; + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); // SoA } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all threads updated C with their values + barrier(CLK_LOCAL_MEM_FENCE); // wait until all 
work-items updated C with their values } } } diff --git a/include/plssvm/backends/OpenCL/kernel/detail/fill_kernel.cl b/include/plssvm/backends/OpenCL/kernel/detail/fill_kernel.cl new file mode 100644 index 000000000..76c0ba424 --- /dev/null +++ b/include/plssvm/backends/OpenCL/kernel/detail/fill_kernel.cl @@ -0,0 +1,40 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implement a fill kernel using OpenCL. + */ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/** + * @brief Fill the float data pointer @p data with the @p value. + * @param[out] data the pointer to fill with values + * @param[in] value the value used to fill @p data + * @param[in] pos the start position for filling @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_fill_kernel_float(__global float *data, const float value, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + data[pos + idx] = value; + } +} + +/** + * @brief Fill the double data pointer @p data with the @p value. + * @param[out] data the pointer to fill with values + * @param[in] value the value used to fill @p data + * @param[in] pos the start position for filling @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_fill_kernel_double(__global double *data, const double value, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + data[pos + idx] = value; + } +} diff --git a/include/plssvm/backends/OpenCL/kernel/detail/memset_kernel.cl b/include/plssvm/backends/OpenCL/kernel/detail/memset_kernel.cl new file mode 100644 index 000000000..88b4f67e1 --- /dev/null +++ b/include/plssvm/backends/OpenCL/kernel/detail/memset_kernel.cl @@ -0,0 +1,46 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implement a memset kernel using OpenCL. + */ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/** + * @brief Memset the float data pointer @p data with the @p value. + * @param[out] data the pointer to memset with the pattern + * @param[in] pattern the pattern used to memset @p data + * @param[in] pos the start position for the memset operation on @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_memset_kernel_float(__global float *data, const uchar pattern, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + // pack the 1-Byte pattern into a 4-Byte uint + const uint packed_pattern = (pattern << 24) | (pattern << 16) | (pattern << 8) | pattern; + // bitwise cast the uint to a float + data[pos + idx] = as_float(packed_pattern); + } +} + +/** + * @brief Memset the double data pointer @p data with the @p value. 
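The memset kernels here reproduce byte-wise memset semantics by replicating the 1-byte pattern across a full word and bit-casting it to the element type (as_float / as_double). A minimal host-side C++ sanity check of that packing, a sketch only, assuming std::memcpy as the stand-in for the OpenCL bit-cast and an arbitrary test pattern of 0xAB:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    const unsigned char pattern = 0xAB;  // arbitrary test pattern

    // replicate the 1-byte pattern into a 4-byte word, as device_memset_kernel_float does
    const std::uint32_t packed = (std::uint32_t{ pattern } << 24) | (std::uint32_t{ pattern } << 16) | (std::uint32_t{ pattern } << 8) | std::uint32_t{ pattern };

    // bit-cast the packed word to a float (host-side equivalent of as_float)
    float via_packing{};
    std::memcpy(&via_packing, &packed, sizeof(float));

    // reference behaviour: byte-wise memset of a single float
    float via_memset{};
    std::memset(&via_memset, pattern, sizeof(float));

    // both approaches must yield the identical byte pattern
    assert(std::memcmp(&via_packing, &via_memset, sizeof(float)) == 0);
    return 0;
}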
+ * @param[out] data the pointer to memset with the pattern + * @param[in] pattern the pattern used to memset @p data + * @param[in] pos the start position for the memset operation on @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_memset_kernel_double(__global double *data, const uchar pattern, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + // pack the 1-Byte pattern into an 8-Byte ulong + const ulong packed_pattern = ((ulong) pattern << 56) | ((ulong) pattern << 48) | ((ulong) pattern << 40) | ((ulong) pattern << 32) | ((ulong) pattern << 24) | ((ulong) pattern << 16) | ((ulong) pattern << 8) | ((ulong) pattern); + // bitwise cast th ulong to a double + data[pos + idx] = as_double(packed_pattern); + } +} diff --git a/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl b/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl index 286c9db05..70b66e305 100644 --- a/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl +++ b/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl @@ -75,7 +75,7 @@ real_type apply_linear_kernel_function(const real_type value) { /** * @brief Compute the polynomial kernel function using @p value. - * @details Uses a custom power implementation taking advantage of the fact that degree can only be a positive integer. Hardcodes the power function for degrees <= 6. + * @details Uses a custom power implementation taking advantage of the fact that degree can only be a positive integer. * @param[in] value the value to apply the polynomial kernel function to * @param[in] degree the degree parameter of the polynomial kernel function * @param[in] gamma the gamma parameter of the polynomial kernel function @@ -84,36 +84,12 @@ real_type apply_linear_kernel_function(const real_type value) { */ real_type apply_polynomial_kernel_function(const real_type value, const int degree, const real_type gamma, const real_type coef0) { const real_type base = gamma * value + coef0; - switch (degree) { - case 0: return (real_type) 1.0; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result = 1.0; - for (int i = 0; i < degree; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result = 1.0; + for (int i = 0; i < degree; ++i) { + result *= base; } + return result; } /** diff --git a/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl b/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl index e37c1dbfb..42edc442f 100644 --- a/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl +++ b/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl @@ -14,13 +14,13 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable /** - * @brief Predict the @p predict_points_d using the kernel function determined at runtime. + * @brief Predict the @p predict_points using the kernel function determined at runtime. * @details The `PLSSVM_DEVICE_KERNEL_PREDICT_NAME`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. 
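The kernel_functions.cl change above replaces the hardcoded powers for degrees <= 6 with a single generic multiply loop. A short C++ sketch of that loop (int_pow is a hypothetical stand-in name), checked against std::pow for small non-negative integer degrees:

#include <cassert>
#include <cmath>

// same repeated-multiplication scheme as the generic loop now used in apply_polynomial_kernel_function
double int_pow(const double base, const int degree) {
    double result = 1.0;
    for (int i = 0; i < degree; ++i) {
        result *= base;
    }
    return result;
}

int main() {
    assert(int_pow(1.5, 0) == 1.0);                               // degree 0 yields 1.0
    assert(int_pow(1.5, 3) == 1.5 * 1.5 * 1.5);                   // small degrees match the previously hardcoded cases
    assert(std::abs(int_pow(2.0, 6) - std::pow(2.0, 6)) < 1e-9);  // agrees with std::pow for integer exponents
    return 0;
}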
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -29,107 +29,126 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... only added for Doxygen) */ -__kernel void PLSSVM_DEVICE_KERNEL_PREDICT_NAME(__global real_type *prediction_d, const __global real_type *alpha_d, const __global real_type *rho_d, const __global real_type *sv_d, const __global real_type *predict_points_d, const ulong num_classes, const ulong num_sv, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void PLSSVM_DEVICE_KERNEL_PREDICT_NAME(__global real_type *prediction, const __global real_type *alpha, const __global real_type *rho, const __global real_type *support_vectors, const __global real_type *predict_points, const ulong num_classes, const ulong num_sv, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // create a thread private array used for internal caching + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // 
current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large + + // create two local memory arrays used for caching + __local real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_pp_idx = pp_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_sv_idx = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_pp[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_sv[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; - data_cache_sv[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*pp_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*sv_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const ulong sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_pp_idx_linear = pp_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const 
ulong global_sv_idx_linear = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_sv[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv], data_cache_pp[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(sv_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv], pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(sv_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv], pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } // update temp using the respective kernel function - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_pd][internal_sv] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER); + temp[internal_pp][internal_sv] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_pp][internal_sv] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER); } } { - // reinterpret cache arrays with interchanged dimensions - __local real_type(*alpha_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_pp; - __local real_type(*out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_sv; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*alpha_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local 
real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ul) { + // calculate the indices used in the current thread + const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const ulong sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (ulong class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_sv_idx = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; - alpha_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_sv_idx_linear = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == (ulong) 0) { - out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = -rho_d[dim + threadIdx_y]; - out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ul]; + out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = -rho[class_block + threadIdx_y]; } else { out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; - out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (uint class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (uint class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_0] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_0] += + 
temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv]; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const ulong global_pp_idx = pp_idx + (ulong) internal; - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y], out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul], out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items updated their part of the prediction } diff --git a/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl b/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl index 5844b3a3a..1d579b40d 100644 --- a/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl +++ b/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl @@ -14,148 +14,185 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
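In the predict kernel above, the y-dimension of the global range is split across blocks of support vectors, so each work-group only produces a partial sum for a given (predict point, class) entry; the partials are combined with atomicAdd, and -rho is contributed only by work-groups with blockIdx_y == 0 so the bias is applied exactly once. A tiny sequential C++ sketch of that accumulation scheme, with made-up per-support-vector contributions and an assumed block size of 2:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
    // made-up per-support-vector contributions to one (predict point, class) entry
    const std::vector<double> contribution = { 0.4, -0.1, 0.7, 0.2, -0.3, 0.5 };
    const double rho = 0.25;
    const std::size_t block_size = 2;  // assumed number of support vectors handled per work-group

    // reference: full sum over all support vectors minus the bias
    double reference = -rho;
    for (const double c : contribution) {
        reference += c;
    }

    // blocked scheme: every block adds its partial sum to the global result,
    // and only the first block (blockIdx_y == 0 in the kernel) also contributes -rho
    double prediction = 0.0;
    for (std::size_t block = 0; block < contribution.size(); block += block_size) {
        double partial = (block == 0) ? -rho : 0.0;
        for (std::size_t sv = block; sv < std::min(block + block_size, contribution.size()); ++sv) {
            partial += contribution[sv];
        }
        prediction += partial;  // in the kernel this update is the atomicAdd into prediction[]
    }

    assert(std::abs(prediction - reference) < 1e-12);
    return 0;
}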
+ * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_w_linear(__global real_type *w_d, const __global real_type *alpha_d, const __global real_type *sv_d, const ulong num_classes, const ulong num_sv, const ulong device_specific_num_sv, const ulong sv_offset, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_w_linear(__global real_type *w, const __global real_type *alpha, const __global real_type *support_vectors, const ulong num_classes, const ulong num_sv, const ulong device_num_sv, const ulong device_sv_offset, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // create a thread private array used for internal caching + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const 
ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large + + // create two local memory arrays used for caching + __local real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (ulong sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_feature_idx = feature_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_class_idx = class_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - data_cache_feature[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ul) + sv + threadIdx_y]; // SoA - data_cache_alpha[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ul) + sv + sv_offset + threadIdx_y]; // AoS - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (ulong sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_feature_idx_linear = feature_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_class_idx_linear = class_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + feature_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (uint block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the dot product calculation, the sv is the fastest moving index for (uint internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_1 * 
INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum = 0.0; + for (uint sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; } } +#else + // perform the dot product calculation, the sv is the slowest moving index + for (uint sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (uint internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const ulong feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (uint internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data const ulong global_feature_idx = feature_idx + (ulong) internal_feature; const ulong global_class_idx = class_idx + (ulong) internal_class; - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ul) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
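device_kernel_w_linear above precomputes w as the alpha-weighted sum of the support vectors per feature and class, so that predicting a point with the linear kernel reduces to a single dot product with w minus rho (which is what device_kernel_predict_linear then evaluates). A single-class C++ sketch of that identity, with toy alpha, support vector, and rho values made up purely for illustration:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
    // toy problem: 2 support vectors, 3 features, 1 class (all values made up)
    const std::vector<double> alpha = { 0.5, -0.25 };                   // learned weight per support vector
    const std::vector<std::vector<double>> sv = { { 1.0, 2.0, 3.0 },
                                                  { 4.0, 5.0, 6.0 } };  // support vectors
    const double rho = 0.1;                                             // learned bias

    // w[f] = sum over all support vectors of alpha[s] * sv[s][f] (what device_kernel_w_linear computes per class)
    std::vector<double> w(3, 0.0);
    for (std::size_t s = 0; s < sv.size(); ++s) {
        for (std::size_t f = 0; f < w.size(); ++f) {
            w[f] += alpha[s] * sv[s][f];
        }
    }

    // predicting a point is then a single dot product: w . x - rho
    const std::vector<double> x = { 1.0, 0.0, -1.0 };
    double prediction = -rho;
    for (std::size_t f = 0; f < w.size(); ++f) {
        prediction += w[f] * x[f];
    }

    // the same value computed directly from the support vectors
    double reference = -rho;
    for (std::size_t s = 0; s < sv.size(); ++s) {
        double dot = 0.0;
        for (std::size_t f = 0; f < x.size(); ++f) {
            dot += sv[s][f] * x[f];
        }
        reference += alpha[s] * dot;
    }
    assert(std::abs(prediction - reference) < 1e-12);
    return 0;
}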
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_predict_linear(__global real_type *prediction_d, const __global real_type *w_d, const __global real_type *rho_d, const __global real_type *predict_points_d, const ulong num_classes, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_predict_linear(__global real_type *prediction, const __global real_type *w, const __global real_type *rho, const __global real_type *predict_points, const ulong num_classes, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // create a thread private array used for internal caching + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + 
offsets if the global range is too large + + // create two local memory arrays used for caching + __local real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_pp_idx = pp_idx_linear + internal * THREAD_BLOCK_SIZE; - const ulong global_class_idx = class_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_pp[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_w[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ul) + global_class_idx]; - data_cache_w[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_classes + PADDING_SIZE_ul) + global_class_idx]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_pp_idx_linear = pp_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_class_idx_linear = class_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index + for (uint internal_pp = 
0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const ulong global_pp_idx = pp_idx + (ulong) internal_pd; + // calculate the indices to access the global data + const ulong global_pp_idx = pp_idx + (ulong) internal_pp; const ulong global_class_idx = class_idx + (ulong) internal_class; - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ul) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index e1041024a..01b1ec54a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -21,7 +21,6 @@ #include // std::array #include // std::ceil #include // std::size_t -#include // std::vector namespace plssvm::openmp::detail { @@ -29,60 +28,65 @@ namespace plssvm::openmp::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
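The "memory optimized" storage of the symmetric matrix A mentioned above is, judging from the index arithmetic in the loop body below, a padded packed upper-triangular layout: row r only stores columns r..n-1 followed by PADDING_SIZE padding entries, and element (r, c) with r <= c sits at offset r * (n + PADDING_SIZE) + c - r * (r + 1) / 2, where n = num_rows - device_row_offset. A short C++ check of that closed form against a layout built row by row (n and PADDING are small assumed values):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t n = 6;        // rows/columns of the symmetric block (assumed toy value)
    constexpr std::size_t PADDING = 2;  // padding entries per packed row (assumed toy value)

    // build the packed upper triangle row by row: row r stores columns r..n-1, then PADDING zeros
    std::vector<double> packed;
    std::vector<std::vector<std::size_t>> expected_offset(n, std::vector<std::size_t>(n, 0));
    for (std::size_t r = 0; r < n; ++r) {
        for (std::size_t c = r; c < n; ++c) {
            expected_offset[r][c] = packed.size();
            packed.push_back(1.0);  // placeholder value, only the offsets matter here
        }
        packed.insert(packed.end(), PADDING, 0.0);
    }

    // closed-form index used in device_kernel_symm for r <= c
    for (std::size_t r = 0; r < n; ++r) {
        for (std::size_t c = r; c < n; ++c) {
            const std::size_t idx = r * (n + PADDING) + c - r * (r + 1) / 2;
            assert(idx == expected_offset[r][c]);
        }
    }
    return 0;
}

For entries below the diagonal the kernel simply swaps the roles of row and column around the diagonal check, i.e. it reads index(min(r, c), max(r, c)), which is where the symmetry of A is exploited.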
* @param[in] num_rows the number of rows and columns in @p A * @param[in] num_rhs the number of rows in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_device_specific_num_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += 
THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -90,13 +94,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of 
bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -111,69 +116,75 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset 
({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_num_mirror_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_num_mirror_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values using blocking + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + sum += A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B(global_i_idx, dim_block + dim + device_row_offset); // SoA + } + temp[internal_j][internal_i] += sum; } } } - // apply 
the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9403b12a1..70d2f9edb 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -28,83 +28,89 @@ namespace plssvm::openmp::detail { /** * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use + * @tparam kernel_function the compile-time kernel function to use * @tparam Args the types of the potential additional arguments for the @p kernel function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all 
features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 771689209..332a0a26a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -26,25 +26,25 @@ namespace plssvm::openmp::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -53,68 +53,96 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal 
caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_row); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_col); -// symmetry + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); + // symmetry #pragma omp atomic - C(class_idx, global_col) += alpha * temp_ij * B(class_idx, global_row); + C(class_block + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp index 59fd0f43c..359e2f8ff 100644 --- a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp @@ -27,42 +27,17 @@ namespace plssvm::openmp::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
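The hunk above intentionally drops the hard-coded special cases for degree <= 6 and keeps only the plain O(exponent) loop, since the polynomial degree is small in practice. Purely as an illustration (not what this patch uses), a square-and-multiply variant would get by with O(log exponent) multiplications; the sketch below is hypothetical and assumes the same real_type alias as the surrounding header:

// illustrative sketch only, not part of the patch
[[nodiscard]] inline real_type powi_by_squaring(real_type base, int exponent) {
    real_type result{ 1.0 };
    while (exponent > 0) {
        if (exponent & 1) {
            result *= base;  // multiply the current bit in
        }
        base *= base;        // square the base for the next bit
        exponent >>= 1;
    }
    return result;
}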
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 407096055..741c696af 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -31,28 +31,68 @@ namespace plssvm::openmp::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_features = support_vectors.num_cols(); + const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / 
INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, device_specific_num_sv, sv_offset) - for (std::size_t a = 0; a < num_classes; ++a) { - for (std::size_t dim = 0; dim < num_features; ++dim) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t idx = 0; idx < device_specific_num_sv; ++idx) { - temp = std::fma(alpha(a, sv_offset + idx), support_vectors(sv_offset + idx, dim), temp); + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, device_sv_offset) + for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t feature_thread = 0; feature_thread < THREAD_BLOCK_SIZE_uz; ++feature_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { + // calculate the indices used in the current thread + const std::size_t feature_idx = (feature_block + feature_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; + } + } + } } - w(a, dim) = temp; } } } @@ -63,29 +103,69 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const 
std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); -#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, device_specific_num_predict_points, row_offset) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t dim = 0; dim < num_features; ++dim) { - temp = std::fma(w(a, dim), predict_points(row_offset + point_index, dim), temp); + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { + // calculate the indices used in the current thread + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const 
std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; + } + } + } + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; + } + } + } } - prediction(row_offset + point_index, a) = temp - rho[a]; } } } @@ -99,59 +179,64 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - prediction(row_offset + point_index, a) -= rho[a]; + for (std::size_t pp_idx = 0; pp_idx < device_num_predict_points; ++pp_idx) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + prediction(device_row_offset + pp_idx, class_idx) -= rho[class_idx]; } } #pragma omp parallel for collapse(2) - for (std::size_t pp = 0; pp < blocked_device_specific_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) { - for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += 
THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) { - for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t sv_thread = 0; sv_thread < THREAD_BLOCK_SIZE_uz; ++sv_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (pp + pp_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (sv + sv_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (sv_block + sv_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -159,22 +244,21 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } - // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + // atomically add the results to the prediction + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not 
perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, a) += alpha(a, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp index 55b6a746b..b21d95619 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp @@ -18,7 +18,7 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp" // plssvm::adaptivecpp::detail::device_ptr #include "plssvm/backends/SYCL/AdaptiveCpp/detail/pinned_memory.hpp" // plssvm::adaptivecpp::detail::pinned_memory #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue (PImpl) -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size @@ -61,7 +61,7 @@ class csvm : public ::plssvm::detail::gpu_csvm(named_args)... }; - // check whether a specific SYCL kernel invocation type has been requested - if constexpr (parser.has(sycl_kernel_invocation_type)) { + // check whether a specific SYCL data parallel kernel has been requested + if constexpr (parser.has(sycl_data_parallel_kernel)) { // compile time check: the value must have the correct type - invocation_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_kernel_invocation_type); + data_parallel_kernel_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_data_parallel_kernel); #if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) - if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { - throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the AdaptiveCpp SYCL backend!" }; - } else if (invocation_type_ == sycl::kernel_invocation_type::scoped) { - throw ::plssvm::invalid_parameter_exception{ "he provided sycl::kernel_invocation_type::scoped is disabled for the AdaptiveCpp SYCL backend!" }; + if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::data_parallel_kernel::hierarchical is disabled for the AdaptiveCpp SYCL backend!" }; + } else if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::scoped) { + throw ::plssvm::invalid_parameter_exception{ "he provided sycl::data_parallel_kernel::scoped is disabled for the AdaptiveCpp SYCL backend!" }; } #endif } @@ -112,10 +112,10 @@ class csvm : public ::plssvm::detail::gpu_csvm /** * @brief Convert the provided @p grid and @p block to the final SYCL execution range. 
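For context on how the renamed template parameter below steers the launch, this is a minimal standalone SYCL 2020 sketch (placeholder grid/block values, not PLSSVM code) of the two most common flavors: a basic data parallel kernel launched over a plain range, and a work-group data parallel kernel launched over an nd_range.

#include "sycl/sycl.hpp"

void launch_flavor_sketch(::sycl::queue &q) {
    const ::sycl::range<2> grid{ 4, 4 };    // number of work-groups per dimension
    const ::sycl::range<2> block{ 16, 16 }; // work-items per work-group

    // basic data parallel kernel: only a global range, indexed via sycl::item
    q.parallel_for(::sycl::range<2>{ grid * block }, [=](::sycl::item<2> idx) {
        // idx.get_id(0) / idx.get_id(1) give the global index
    });

    // work-group data parallel kernel: global + local range, indexed via sycl::nd_item
    q.parallel_for(::sycl::nd_range<2>{ grid * block, block }, [=](::sycl::nd_item<2> idx) {
        // idx.get_global_id(), idx.get_local_id(), and idx.get_group() are available
    });

    q.wait();
}

The hierarchical and scoped flavors in the hunk below instead build an nd_range directly from the grid and block, since SYCL's hierarchical parallel_for_work_group is specified in terms of the number of work-groups rather than the global size.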
- * @tparam invocation_type the SYCL kernel invocation type + * @tparam kernel_type the SYCL data parallel kernel * @param[in] grid the execution grid * @param[in] block the execution block * @return the SYCL native execution range */ -template +template auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); const ::sycl::range native_block = detail::dim_type_to_native<2>(block); - if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + if constexpr (kernel_type == sycl::data_parallel_kernel::basic) { return ::sycl::range<2>{ native_grid * native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::work_group) { return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical || invocation_type == sycl::kernel_invocation_type::scoped) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::hierarchical || kernel_type == sycl::data_parallel_kernel::scoped) { return ::sycl::nd_range<2>{ native_grid, native_block }; } else { // can't be reached diff --git a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp index 4b1a6b570..f322cb877 100644 --- a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp +++ b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp @@ -15,10 +15,10 @@ #include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} #include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp" // plssvm::dpcpp::detail::device_ptr #include "plssvm/backends/SYCL/DPCPP/detail/pinned_memory.hpp" // plssvm::dpcpp::detail::pinned_memory #include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::dpcpp::detail::queue (PImpl) -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size @@ -64,7 +64,7 @@ class csvm : public ::plssvm::detail::gpu_csvm(named_args)... }; - // check whether a specific SYCL kernel invocation type has been requested - if constexpr (parser.has(sycl_kernel_invocation_type)) { + // check whether a specific SYCL data parallel kernel has been requested + if constexpr (parser.has(sycl_data_parallel_kernel)) { // compile time check: the value must have the correct type - invocation_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_kernel_invocation_type); - // the invocation type "scoped" isn't supported by DPC++ - if (invocation_type_ == sycl::kernel_invocation_type::scoped) { - throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::scoped isn't supported by DPC++!" 
}; + data_parallel_kernel_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_data_parallel_kernel); + // the data parallel kernel "scoped" isn't supported by DPC++ + if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::scoped) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::data_parallel_kernel::scoped isn't supported by DPC++!" }; } #if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) - if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { - throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the DPC++ SYCL backend!" }; + if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::data_parallel_kernel::hierarchical is disabled for the DPC++ SYCL backend!" }; } #endif } @@ -114,10 +114,10 @@ class csvm : public ::plssvm::detail::gpu_csvm /** * @brief Convert the provided @p grid and @p block to the final SYCL execution range. - * @tparam invocation_type the SYCL kernel invocation type + * @tparam kernel_type the SYCL data parallel kernel * @param[in] grid the execution grid * @param[in] block the execution block * @return the SYCL native execution range */ -template +template auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); const ::sycl::range native_block = detail::dim_type_to_native<2>(block); - if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + if constexpr (kernel_type == sycl::data_parallel_kernel::basic) { return ::sycl::range<2>{ native_grid * native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::work_group) { return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::hierarchical) { return ::sycl::nd_range<2>{ native_grid, native_block }; } else { // can't be reached diff --git a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp b/include/plssvm/backends/SYCL/data_parallel_kernels.hpp similarity index 54% rename from include/plssvm/backends/SYCL/kernel_invocation_types.hpp rename to include/plssvm/backends/SYCL/data_parallel_kernels.hpp index d7cec1924..ede8ee3fb 100644 --- a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp +++ b/include/plssvm/backends/SYCL/data_parallel_kernels.hpp @@ -6,11 +6,11 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Defines an enumeration holding all possible SYCL kernel invocation types. + * @brief Defines an enumeration holding all possible SYCL data parallel kernels. */ -#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_INVOCATION_TYPE_HPP_ -#define PLSSVM_BACKENDS_SYCL_KERNEL_INVOCATION_TYPE_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_DATA_PARALLEL_KERNELS_HPP_ +#define PLSSVM_BACKENDS_SYCL_DATA_PARALLEL_KERNELS_HPP_ #pragma once #include "fmt/base.h" // fmt::formatter @@ -22,10 +22,10 @@ namespace plssvm::sycl { /** - * @brief Enum class for all possible SYCL kernel invocation types. + * @brief Enum class for all possible SYCL data parallel kernels. 
*/ -enum class kernel_invocation_type { - /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */ +enum class data_parallel_kernel { + /** Use the best data parallel kernel for the current SYCL implementation and target hardware platform. In practice, will nearly always map to work-group data parallel kernels. */ automatic, /** Use the [`basic` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_basic_data_parallel_kernels). */ basic, @@ -38,35 +38,35 @@ enum class kernel_invocation_type { }; /** - * @brief Return a list of all currently available SYCL kernel invocation types. - * @details SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation type can be disabled during the CMake configuration. - * @return the available SYCL kernel invocation types (`[[nodiscard]]`) + * @brief Return a list of all currently available SYCL data parallel kernels. + * @details SYCL's hierarchical data parallel kernels and AdaptiveCpp's scoped parallelism can be disabled during the CMake configuration. + * @return the available SYCL data parallel kernels (`[[nodiscard]]`) */ -[[nodiscard]] std::vector list_available_sycl_kernel_invocation_types(); +[[nodiscard]] std::vector list_available_sycl_data_parallel_kernels(); /** - * @brief Output the @p invocation type to the given output-stream @p out. + * @brief Output the @p kernel_type type to the given output-stream @p out. * @param[in,out] out the output-stream to write the backend type to - * @param[in] invocation the SYCL kernel invocation type + * @param[in] kernel_type the SYCL data parallel kernel * @return the output-stream */ -std::ostream &operator<<(std::ostream &out, kernel_invocation_type invocation); +std::ostream &operator<<(std::ostream &out, data_parallel_kernel kernel_type); /** - * @brief Use the input-stream @p in to initialize the @p invocation type. + * @brief Use the input-stream @p in to initialize the @p kernel_type type. 
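A short usage sketch for the facilities declared above: it checks at runtime whether hierarchical kernels were compiled in (they can be disabled during the CMake configuration) and round-trips one enumerator through the stream operators. The textual spelling accepted by operator>> is an assumption here, not something this patch specifies.

#include "plssvm/backends/SYCL/data_parallel_kernels.hpp"

#include <algorithm>  // std::find
#include <iostream>   // std::cout
#include <sstream>    // std::istringstream
#include <vector>     // std::vector

void data_parallel_kernel_usage_sketch() {
    // query which data parallel kernels are available in this build
    const std::vector<plssvm::sycl::data_parallel_kernel> available = plssvm::sycl::list_available_sycl_data_parallel_kernels();
    const bool has_hierarchical = std::find(available.cbegin(), available.cend(), plssvm::sycl::data_parallel_kernel::hierarchical) != available.cend();

    // parse a command line style value and print it back using the stream operators declared above
    plssvm::sycl::data_parallel_kernel kernel{};
    std::istringstream{ "work_group" } >> kernel;  // assumed spelling of the enumerator
    std::cout << kernel << " (hierarchical available: " << has_hierarchical << ")\n";
}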
* @param[in,out] in input-stream to extract the backend type from - * @param[in] invocation the SYCL kernel invocation type + * @param[in] kernel_type the SYCL data parallel kernel * @return the input-stream */ -std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation); +std::istream &operator>>(std::istream &in, data_parallel_kernel &kernel_type); } // namespace plssvm::sycl /// @cond Doxygen_suppress template <> -struct fmt::formatter : fmt::ostream_formatter { }; +struct fmt::formatter : fmt::ostream_formatter { }; /// @endcond -#endif // PLSSVM_BACKENDS_SYCL_KERNEL_INVOCATION_TYPE_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_DATA_PARALLEL_KERNELS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index 2e528149c..120f637b9 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
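The operator() further below indexes A as a row-major, padded, packed upper triangular matrix: the two branches of its if statement swap the roles of the row and column index so that row <= column always holds, and then use the linear index r * (n + pad) + c - r * (r + 1) / 2, where n corresponds to num_rows_ - device_row_offset_ and pad to PADDING_SIZE. A standalone sketch of that layout, with hypothetical helper names:

#include <cstddef>  // std::size_t
#include <vector>   // std::vector

// linear index of entry (r, c), r <= c, in the padded packed upper triangular layout
std::size_t packed_upper_index(const std::size_t r, const std::size_t c, const std::size_t n, const std::size_t pad) {
    return r * (n + pad) + c - r * (r + 1) / 2;
}

// pack the upper triangle of a full n x n matrix row by row, padding each row with pad zeros
std::vector<double> pack_upper_triangular(const std::vector<std::vector<double>> &full, const std::size_t pad) {
    const std::size_t n = full.size();
    std::vector<double> packed(n * (n + pad) - n * (n - 1) / 2, 0.0);  // row r holds (n - r) values plus pad padding entries
    for (std::size_t r = 0; r < n; ++r) {
        for (std::size_t c = r; c < n; ++c) {
            packed[packed_upper_index(r, c, n, pad)] = full[r][c];
        }
    }
    return packed;
}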
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,33 +66,63 @@ class device_kernel_symm { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + 
static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_j) { - A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + sum += A_cache * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + temp[internal_i][internal_j] += A_cache * B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } } - - temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } } } @@ -93,13 +130,14 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +147,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -125,16 +163,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
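A reading aid for the `A_` index arithmetic above: `A` holds only the upper triangular part, packed row by row with a padded row stride, and the two branches merely swap the roles of `dim` and `global_j_idx` so that the smaller index always selects the packed row. A small sketch of that formula with made-up sizes (the helper name and the example values are not taken from the code base):

```cpp
#include <algorithm>
#include <cstddef>

// index of element (row, col) in a row-wise packed upper triangular n x n matrix whose
// rows are padded to a stride of n + padding; row r starts r * (n + padding) elements in,
// minus the r * (r + 1) / 2 entries the packing drops below the diagonal
constexpr std::size_t packed_upper_triangular_index(const std::size_t row, const std::size_t col, const std::size_t n, const std::size_t padding) {
    const std::size_t r = std::min(row, col);  // the smaller index selects the packed row ...
    const std::size_t c = std::max(row, col);  // ... the larger one the column (A is symmetric)
    return r * (n + padding) - r * (r + std::size_t{ 1 }) / std::size_t{ 2 } + c;
}

// without padding, a 4 x 4 upper triangle is stored as rows of length 4, 3, 2, 1
static_assert(packed_upper_triangular_index(0, 0, 4, 0) == 0);
static_assert(packed_upper_triangular_index(1, 1, 4, 0) == 4);
static_assert(packed_upper_triangular_index(2, 3, 4, 0) == 8);
static_assert(packed_upper_triangular_index(3, 1, 4, 0) == packed_upper_triangular_index(1, 3, 4, 0));
```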
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -143,12 +186,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -162,25 +205,49 @@ class device_kernel_symm_mirror { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); - - temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } } } } @@ -188,13 +255,14 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx 
= device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -205,8 +273,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -223,6 +291,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -244,19 +315,21 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -277,6 +350,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
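The index scheme of these element-wise kernels is the same blocking used throughout the file: each work-item owns an `INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE` tile, and the grid offsets shift the whole launch when one execution grid cannot cover the matrix. A sketch of that mapping with assumed example block sizes (the real values come from `plssvm/constants.hpp`):

```cpp
#include <cstddef>

constexpr std::size_t THREAD_BLOCK_SIZE_example = 16;   // assumed value for the sketch
constexpr std::size_t INTERNAL_BLOCK_SIZE_example = 4;  // assumed value for the sketch

// first row/column of the tile owned by the work-item with the given id, mirroring
// (idx.get_id(...) + grid_offset * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE from above
constexpr std::size_t tile_origin(const std::size_t item_id, const std::size_t grid_offset) {
    return (item_id + grid_offset * THREAD_BLOCK_SIZE_example) * INTERNAL_BLOCK_SIZE_example;
}

static_assert(tile_origin(0, 0) == 0);   // work-item 0 starts at row/column 0
static_assert(tile_origin(1, 0) == 4);   // neighbouring work-items own adjacent 4-wide tiles
static_assert(tile_origin(0, 1) == 64);  // the next grid continues 16 work-items * 4 values later
```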
* @param[in] num_cols the number of columns in the matrix @@ -298,19 +374,21 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 65587ddaa..6e1c99e65 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -13,9 +13,11 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -27,19 +29,23 @@ namespace plssvm::sycl::detail::basic { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
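To summarize what the assembly functor stores (see the epilogue of `operator()` further down in this hunk): the blocked feature loop accumulates the pairwise reduction, and the final step applies the kernel function `k` together with the dimensional reduction terms, writing only the packed upper triangular entries with `global_i_idx >= global_j_idx`:

```latex
% value of each stored kernel matrix entry (upper triangular part only, i >= j)
\[
    \tilde{K}_{ij} \;=\; k\!\left(\vec{x}_i, \vec{x}_j\right) + \mathit{QA\_cost} - q_i - q_j
    \;+\;
    \begin{cases}
        \mathit{cost} & \text{if } i = j \\
        0             & \text{otherwise}
    \end{cases}
\]
```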
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,19 +54,19 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward<Args>(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -69,25 +75,50 @@ class device_kernel_assembly { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows - if (i >= j) { - // create a work-item private array used for internal caching + // only calculate the upper triangular matrix + if (i_idx >= j_idx) { + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + 
static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -95,22 +126,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -119,11 +151,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index de6358ec8..f1c3e8945 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,9 @@ #define 
PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,36 +66,15 @@ class device_kernel_symm { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - 
::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -97,37 +83,44 @@ class device_kernel_symm { } }); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = 
static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + 
B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); @@ -135,13 +128,28 @@ class device_kernel_symm { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -152,17 +160,31 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto 
device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -173,8 +195,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -189,16 +211,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
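As a counterpart to the basic launch shape shown earlier, a hierarchical functor such as this one is submitted through `parallel_for_work_group`, with work-group scope between the `parallel_for_work_item` calls and an implicit work-group barrier after each of them. A rough sketch with made-up extents (the actual submission code again lives in the backend sources):

```cpp
#include <sycl/sycl.hpp>

int main() {
    ::sycl::queue queue{};

    const ::sycl::range<2> num_work_groups{ 8, 8 };
    const ::sycl::range<2> work_group_size{ 16, 16 };

    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(num_work_groups, work_group_size, [=](::sycl::group<2> group) {
            // variables declared at this scope live once per work-group (like A_cache/B_cache above)
            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
                // per work-item code; an implicit work-group barrier follows this scope,
                // which is what the "// implicit barrier" comments above rely on
                (void) idx;
            });
        });
    }).wait();
    return 0;
}
```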
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -207,12 +234,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -226,36 +253,15 @@ class device_kernel_symm_mirror { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + 
grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices and diagonal condition - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -263,42 +269,67 @@ class device_kernel_symm_mirror { } }); - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> 
store twice as many values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // store the values in the local memory + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); // implicit barrier - // perform the feature reduction calculation + // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += 
A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -309,17 +340,31 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -331,8 +376,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -349,6 +394,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. 
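The `if constexpr (target == target_platform::cpu)` branches introduced above select the loop order of the cached dot product: on the CPU the reduction dimension `dim` is the fastest moving index, presumably so the auto-vectorizers (whose preferred vector width is configured elsewhere in this patch) see a plain contiguous reduction, while on other targets `dim` is the slowest moving index so each cached value is reused across the whole per-work-item register block. The following standalone sketch only illustrates those two loop orders; it works on one work-item's slice of the caches, uses illustrative sizes, plain `double` instead of `real_type`, and the hypothetical names `block_dot_cpu`/`block_dot_gpu` that do not appear in the patch.

// Simplified, self-contained sketch of the two loop orders (not the PLSSVM kernels themselves).
namespace sketch {

constexpr unsigned THREAD_BLOCK_SIZE = 8;    // illustrative value
constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // illustrative value

// CPU variant: `dim` is the innermost (fastest moving) loop, so each temp[i][j]
// is a plain dot product that an auto-vectorizer can turn into SIMD code.
void block_dot_cpu(const double A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   const double B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   double temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]) {
    for (unsigned i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
        for (unsigned j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
            double sum = 0.0;
            for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
                sum += A_cache[dim][j] * B_cache[dim][i];
            }
            temp[i][j] += sum;
        }
    }
}

// Non-CPU variant: `dim` is the outermost (slowest moving) loop, so every cached
// value loaded for one `dim` is reused across the full INTERNAL_BLOCK_SIZE x
// INTERNAL_BLOCK_SIZE block before the next `dim` is touched.
void block_dot_gpu(const double A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   const double B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   double temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]) {
    for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
        for (unsigned i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
            for (unsigned j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
                temp[i][j] += A_cache[dim][j] * B_cache[dim][i];
            }
        }
    }
}

}  // namespace sketch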
+ constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -371,25 +419,27 @@ class device_kernel_inplace_matrix_add { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -411,6 +461,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
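The rewritten `device_kernel_inplace_matrix_add` (and the matching scale kernel below) keeps the existing addressing scheme: every work-item owns an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile, matrices are stored row-major with PADDING_SIZE extra columns per row, and the global element (i, j) therefore lives at `i * (num_cols + PADDING_SIZE) + j`. The host-side sketch below only restates that index math; the function name `add_tile`, the constant values, and the use of `double` and `std::vector` are illustrative and not part of the patch.

// Host-side sketch of the tile/index decomposition used by the in-place add kernel.
#include <cstddef>
#include <vector>

constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // illustrative value
constexpr std::size_t PADDING_SIZE = 8;      // illustrative value

// lhs += rhs for one INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of a padded,
// row-major matrix, using the same decomposition as the SYCL kernel:
// global index = (work-group index * work-group size + work-item index) * INTERNAL_BLOCK_SIZE + internal index.
void add_tile(std::vector<double> &lhs, const std::vector<double> &rhs, const std::size_t num_cols,
              const std::size_t block_idx_y, const std::size_t block_dim_y, const std::size_t thread_idx_y,
              const std::size_t block_idx_x, const std::size_t block_dim_x, const std::size_t thread_idx_x) {
    const std::size_t i_idx = (block_idx_y * block_dim_y + thread_idx_y) * INTERNAL_BLOCK_SIZE;
    const std::size_t j_idx = (block_idx_x * block_dim_x + thread_idx_x) * INTERNAL_BLOCK_SIZE;
    for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
        for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
            const std::size_t global_i = i_idx + internal_i;
            const std::size_t global_j = j_idx + internal_j;
            // padded row-major addressing: the row stride is num_cols + PADDING_SIZE
            lhs[global_i * (num_cols + PADDING_SIZE) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE) + global_j];
        }
    }
}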
* @param[in] num_cols the number of columns in the matrix @@ -433,25 +486,27 @@ class device_kernel_inplace_matrix_scale { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index b09fef0f8..e6afac623 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -13,12 +13,15 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include 
"plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -27,19 +30,23 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,19 +55,19 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -68,67 +75,56 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; - - ::sycl::private_memory temp{ group }; - - // initialize private and local variables - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // create two local memory arrays used for caching + real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a private memory array used for internal caching + ::sycl::private_memory, INTERNAL_BLOCK_SIZE>, 2> temp{ group }; + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further + if (group[1] >= group[0]) { // initialize private temp matrix to zero - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } } - } - }); - - // implicit group barrier + }); - // exploit symmetry - if (group[1] >= group[0]) { - for (std::size_t 
dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = 
device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -136,14 +132,30 @@ class device_kernel_assembly { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -154,26 +166,40 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global 
range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -183,11 +209,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 9e8500d73..1334e566d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, 
FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -62,92 +69,111 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> 
idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // create two local memory arrays used for caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; 
// num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + 
if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (partial) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto 
device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -159,8 +185,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -175,16 +201,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -193,12 +224,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + 
device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -215,83 +246,105 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number 
of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // store the values in the local memory + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform 
the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (remaining) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + 
device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -304,8 +357,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -322,6 +375,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -346,28 +402,29 @@ class device_kernel_inplace_matrix_add { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j 
= j + internal_j; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -390,6 +447,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in the matrix @@ -414,28 +474,29 @@ class device_kernel_inplace_matrix_scale { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto 
global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 4ed3764ce..c2fcc5df6 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -13,12 +13,15 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -27,19 +30,23 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. 
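// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// The in-place matrix add/scale kernels above all use the same addressing: the matrix is stored
// row-major with each row padded by PADDING_SIZE entries, and every work-item updates an
// INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile. The host-only sketch below replays that index
// math with plain loops so it can be checked in isolation; the constant values, the std::vector
// buffers, and the explicit bounds check are assumptions made only for this sketch.
#include <cstddef>
#include <vector>

namespace sketch {

constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // assumed value; the real constant lives in plssvm/constants.hpp
constexpr unsigned PADDING_SIZE = 8;         // assumed value

// lhs += rhs for a num_rows x num_cols matrix; both buffers must hold num_rows * (num_cols + PADDING_SIZE) values
inline void inplace_matrix_add(std::vector<double> &lhs, const std::vector<double> &rhs,
                               const std::size_t num_rows, const std::size_t num_cols) {
    const std::size_t padded_cols = num_cols + static_cast<std::size_t>(PADDING_SIZE);
    // i_idx/j_idx advance in tiles, exactly like one work-item per INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE block
    for (std::size_t i_idx = 0; i_idx < num_rows; i_idx += INTERNAL_BLOCK_SIZE) {
        for (std::size_t j_idx = 0; j_idx < num_cols; j_idx += INTERNAL_BLOCK_SIZE) {
            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                    const std::size_t global_i = i_idx + internal_i;
                    const std::size_t global_j = j_idx + internal_j;
                    // the device kernels rely on the padded allocation instead of this check;
                    // it is only added here to keep the sketch self-contained
                    if (global_i < num_rows && global_j < num_cols) {
                        lhs[global_i * padded_cols + global_j] += rhs[global_i * padded_cols + global_j];  // SoA-style flat indexing
                    }
                }
            }
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------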
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,19 +55,19 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward<Args>(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -71,94 +78,118 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); - - // exploit symmetry + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp + [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); - // perform calculations + // perform the feature reduction calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 
0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } }); } + // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto 
device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -169,11 +200,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index ae07f7ec6..b179cbabe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -24,16 +26,21 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
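// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// The kernel matrix assembled above (and read again by device_kernel_symm below) only stores the
// upper triangular part, packed row by row and padded, using
//     idx(i, j) = j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i     for i >= j, with n = num_rows - device_row_offset.
// The host-only helper below re-derives that formula and checks that it enumerates each packed row
// contiguously; n and the padding value are assumptions chosen for this sketch only.
#include <cassert>
#include <cstddef>

namespace sketch {

constexpr std::size_t packed_upper_index(const std::size_t i, const std::size_t j,
                                         const std::size_t n, const std::size_t padding) {
    // row j starts after j previous rows whose lengths shrink by one entry each, every row keeping its padding tail
    return j * (n + padding) - j * (j + 1) / 2 + i;
}

inline void check_packed_upper_layout() {
    constexpr std::size_t n = 16;       // assumed number of rows handled by this device
    constexpr std::size_t padding = 4;  // assumed PADDING_SIZE
    std::size_t expected = 0;
    for (std::size_t j = 0; j < n; ++j) {
        // entries with i >= n are the padding tail of packed row j
        for (std::size_t i = j; i < n + padding; ++i) {
            assert(packed_upper_index(i, j, n, padding) == expected);
            ++expected;
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------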
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -42,13 +49,13 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(FEATURE_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(FEATURE_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, + device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + A_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(THREAD_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(THREAD_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -67,72 +74,85 @@ class device_kernel_symm { const auto local_id_1 = static_cast<unsigned>(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else 
{ + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; 
++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -147,8 +167,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -163,17 +183,22 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
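// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// device_kernel_symm above (and device_kernel_symm_mirror below) pick one of two loop orders for
// the cached tile update via `if constexpr (target == target_platform::cpu)`: on the CPU path the
// reduction dimension `dim` is the innermost loop and is summed into a scalar (friendly to the
// auto-vectorizer), on the other targets `dim` is the outermost loop. Both orders produce the same
// tile, as the standalone pair below shows; the block sizes and the real_type alias are
// assumptions for this sketch, and the performance rationale is the one implied by the comments in
// the patch, not a measured claim.
namespace sketch {

constexpr unsigned THREAD_BLOCK_SIZE = 8;    // assumed value
constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // assumed value
using real_type = double;                    // assumed alias

using cache_type = real_type[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE];
using tile_type = real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];

// CPU-style update: dim is the fastest moving index, reduced into a scalar accumulator
inline void tile_update_cpu(const cache_type &A_cache, const cache_type &B_cache, tile_type &temp,
                            const unsigned local_id_0, const unsigned local_id_1) {
    for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
        for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
            real_type sum{ 0.0 };
            for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
                sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i];
            }
            temp[internal_i][internal_j] += sum;
        }
    }
}

// GPU-style update: dim is the slowest moving index, accumulating directly into the tile
inline void tile_update_gpu(const cache_type &A_cache, const cache_type &B_cache, tile_type &temp,
                            const unsigned local_id_0, const unsigned local_id_1) {
    for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                temp[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i];
            }
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------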
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -182,14 +207,14 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -208,63 +233,79 @@ class device_kernel_symm_mirror { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current 
block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all threads loaded their part of the data - - // perform 
the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS 
operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -280,8 +321,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -298,6 +339,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
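// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// Taken together, device_kernel_symm and device_kernel_symm_mirror above compute the BLAS SYMM
// update C = alpha * A * B + beta * C, where A is symmetric, only its upper triangle is stored,
// and its rows may be split across devices. The dense, single-device host reference below shows
// the operation itself; the flat row-major std::vector buffers and the m/n naming are assumptions
// for this sketch and deliberately ignore the packing, padding, and mirroring of the real kernels.
#include <cstddef>
#include <vector>

namespace sketch {

// C (m x n) = alpha * A (m x m, dense symmetric) * B (m x n) + beta * C
inline void symm_reference(const double alpha, const std::vector<double> &A, const std::vector<double> &B,
                           const double beta, std::vector<double> &C, const std::size_t m, const std::size_t n) {
    for (std::size_t row = 0; row < m; ++row) {
        for (std::size_t col = 0; col < n; ++col) {
            double sum{ 0.0 };
            for (std::size_t k = 0; k < m; ++k) {
                sum += A[row * m + k] * B[k * n + col];  // A[row][k] * B[k][col]
            }
            C[row * n + col] = alpha * sum + beta * C[row * n + col];
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------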
* @param[in] num_cols the number of columns in both matrices @@ -319,25 +363,27 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -358,6 +404,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
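// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// Every kernel in these headers receives grid_x_offset / grid_y_offset and adds them to the
// work-group id: when the logical grid is larger than what a single launch may cover, the host can
// submit the same kernel several times, once per chunk, passing the chunk's offsets. The toy
// functor and host loop below only illustrate that scheme; the chunk limit, work-group size,
// USM output buffer, and the functor itself are assumptions and do not mirror PLSSVM's actual
// launch code.
#include <algorithm>
#include <cstddef>
#include <sycl/sycl.hpp>

namespace sketch {

class toy_offset_kernel {
  public:
    toy_offset_kernel(int *out, const std::size_t num_groups_x, const std::size_t grid_x_offset, const std::size_t grid_y_offset) :
        out_{ out }, num_groups_x_{ num_groups_x }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { }

    void operator()(::sycl::nd_item<2> nd_idx) const {
        // reconstruct the position in the *logical* grid from the chunk-local group id plus the offsets
        const std::size_t block_x = nd_idx.get_group(0) + grid_x_offset_;
        const std::size_t block_y = nd_idx.get_group(1) + grid_y_offset_;
        out_[block_y * num_groups_x_ + block_x] = 1;  // every work-item of the group writes the same flag
    }

  private:
    int *out_;  // assumed to be a USM allocation accessible on the device
    std::size_t num_groups_x_;
    std::size_t grid_x_offset_;
    std::size_t grid_y_offset_;
};

inline void launch_in_chunks(::sycl::queue &queue, int *out, const std::size_t num_groups_x, const std::size_t num_groups_y) {
    constexpr std::size_t max_groups_per_launch = 4;  // assumed per-dimension limit of a single launch
    constexpr std::size_t local_size = 8;             // assumed work-group size per dimension
    for (std::size_t gy = 0; gy < num_groups_y; gy += max_groups_per_launch) {
        for (std::size_t gx = 0; gx < num_groups_x; gx += max_groups_per_launch) {
            const std::size_t chunk_x = std::min(max_groups_per_launch, num_groups_x - gx);
            const std::size_t chunk_y = std::min(max_groups_per_launch, num_groups_y - gy);
            queue.submit([&](::sycl::handler &cgh) {
                const ::sycl::nd_range<2> execution_range{ ::sycl::range<2>{ chunk_x * local_size, chunk_y * local_size },
                                                           ::sycl::range<2>{ local_size, local_size } };
                cgh.parallel_for(execution_range, toy_offset_kernel{ out, num_groups_x, gx, gy });
            });
        }
    }
    queue.wait();
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------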
* @param[in] num_cols the number of columns in the matrix @@ -379,25 +428,27 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 96030fbe7..b4b836b14 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -13,9 +13,11 @@ #define 
PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -27,20 +29,24 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template <kernel_function_type kernel_function, typename... Args> +template <target_platform target, kernel_function_type kernel_function, typename... Args> class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -49,21 +55,21 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_j_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + data_i_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_j_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -76,74 +82,92 @@ class device_kernel_assembly { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group 
y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, 
pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + 
static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -152,16 +176,16 @@ class device_kernel_assembly { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor data_i_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor data_j_cache_; /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 7b517a7b1..1a8c71c1d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p 
alpha is a scalar. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
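
The explicit work-group kernel above stores only the upper triangular part of the kernel matrix in a packed, padded layout: each value ends up at `device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + 1) / 2 + device_global_i_idx`. As a sanity check, here is a minimal standalone sketch (plain C++, no SYCL; `n` and `padding` are made-up stand-ins for the device-local row count and `PADDING_SIZE`) showing that this formula packs one padded row per column index `j` and never maps two stored entries to the same slot:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Minimal sketch of the padded, packed triangular indexing used when the explicit
// kernel matrix is written back; n and padding are made-up example values.
int main() {
    const std::size_t n = 6;        // rows this device is responsible for (num_rows - device_row_offset)
    const std::size_t padding = 2;  // stands in for PADDING_SIZE

    // index(i, j) = j * (n + padding) - j * (j + 1) / 2 + i, valid for i >= j
    const auto packed_index = [&](const std::size_t i, const std::size_t j) {
        return j * (n + padding) - j * (j + 1) / 2 + i;
    };

    // total number of stored entries: one padded row per column index j
    std::size_t total = 0;
    for (std::size_t j = 0; j < n; ++j) {
        total += n + padding - j;
    }
    std::vector<int> hits(total, 0);

    // every (i, j) pair of the stored triangle maps to a unique, in-bounds slot
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n; ++i) {
            ++hits[packed_index(i, j)];
        }
    }
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n; ++i) {
            std::cout << "(" << i << ", " << j << ") -> " << packed_index(i, j)
                      << (hits[packed_index(i, j)] == 1 ? "" : "  <-- collision!") << '\n';
        }
    }
}
```

The per-row padding mirrors the `PADDING_SIZE` used elsewhere in these kernels, presumably so that the blocked `device_global_i_idx` accesses can run past `num_rows_` without extra bounds checks.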
@@ -75,28 +81,53 @@ class device_kernel_assembly_symm { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -104,28 +135,48 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { - temp_ij += cost_; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = 
device_row_offset_ + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; // symmetry - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } } @@ -137,11 +188,12 @@ class device_kernel_assembly_symm { private: /// @cond Doxygen_suppress const real_type alpha_; + const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 1a24024b6..08ed85c0c 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C 
are matrices, and @p alpha is a scalar. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,37 +73,45 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // the indices used in the current work-item + ::sycl::private_memory i_idx{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx{ group }; // device_num_rows + + ::sycl::private_memory i_idx_linear{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx_linear{ group }; // device_num_rows + + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; // initialize private and local variables group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -111,32 +125,36 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -144,14 +162,30 @@ class device_kernel_assembly_symm { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -165,16 +199,18 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto 
global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -187,47 +223,51 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // allocate shared memory - real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + 
THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; // SoA } }); // implicit group barrier - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -235,20 +275,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_y = idx.get_local_id(1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1 + THREAD_BLOCK_SIZE]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; // SoA } }); @@ -260,10 +302,11 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j 
< INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } @@ -272,47 +315,51 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + // store the values in the local memory + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - 
C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -320,20 +367,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); @@ -347,10 +396,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git 
a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
index 4391f2f19..d7593084b 100644
--- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
@@ -13,10 +13,12 @@
 #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
 #pragma once

+#include "plssvm/backends/SYCL/data_parallel_kernels.hpp"    // plssvm::sycl::data_parallel_kernel
 #include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
 #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
-#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform

 #include "sycl/sycl.hpp"  // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item

@@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::scoped {

 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
  * @details Uses AdaptiveCpp's scoped parallelism.
+ * @tparam target the target platform
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <target_platform target, kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly_symm {
   public:
+    /// The used SYCL data parallel kernel.
+    constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped;
+
     /**
      * @brief Initialize the SYCL kernel function object.
* @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
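For readers following the kernel hunks below, here is a minimal host-side reference sketch (not part of this patch) of what the fused "implicit kernel matrix" SYMM performs: each entry a_ij = k(x_i, x_j) + QA_cost - q_i - q_j (plus cost on the diagonal) is recomputed on the fly and immediately folded into C = alpha * A * B + C, so A is never materialized in global memory. All names (implicit_symm_reference, X, kernel) are hypothetical, and real_type is assumed to be double here.

#include <cstddef>   // std::size_t
#include <vector>    // std::vector

using real_type = double;  // plssvm::real_type is build-configurable; double is assumed in this sketch

// Hypothetical host-side reference of the implicit SYMM operation performed by the device kernels.
template <typename Kernel>
void implicit_symm_reference(const Kernel &kernel, const std::vector<std::vector<real_type>> &X,
                             const std::vector<real_type> &q, const real_type QA_cost, const real_type cost,
                             const real_type alpha, const std::vector<std::vector<real_type>> &B,
                             std::vector<std::vector<real_type>> &C) {
    const std::size_t num_rows = X.size();
    const std::size_t num_classes = B.front().size();
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_rows; ++j) {
            // implicitly calculated kernel matrix entry -> never stored in global memory
            real_type a_ij = kernel(X[i], X[j]) + QA_cost - q[i] - q[j];
            if (i == j) {
                a_ij += cost;  // the cost factor is only applied to the main diagonal
            }
            for (std::size_t c = 0; c < num_classes; ++c) {
                C[i][c] += alpha * a_ij * B[j][c];  // C = alpha * A * B + C
            }
        }
    }
}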
@@ -77,88 +83,124 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // the indices used in the current work-item + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + [&](auto &i_idx, auto &j_idx, auto &i_idx_linear, auto &j_idx_linear, auto &cache_one, auto &cache_two, auto &temp) { // initialize private and local variables ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, 
pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into local memory - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - }); - - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // rename cached arrays + auto &data_i_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] 
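As a standalone illustration of the index arithmetic introduced above (i_idx/j_idx versus their *_linear counterparts), the following sketch shows how a work-item's strided tile index and its coalesced linear index are derived from the work-group and work-item IDs. The block-size values are assumptions for the example; the real constants come from plssvm/constants.hpp.

#include <cstddef>  // std::size_t

// assumed example values; the actual constants are defined in plssvm/constants.hpp
constexpr unsigned THREAD_BLOCK_SIZE = 16;
constexpr unsigned INTERNAL_BLOCK_SIZE = 4;

// strided index: the work-item owns the INTERNAL_BLOCK_SIZE consecutive rows starting at this index
std::size_t strided_index(const std::size_t block_idx, const std::size_t block_dim, const std::size_t thread_idx) {
    return (block_idx * block_dim + thread_idx) * INTERNAL_BLOCK_SIZE;
}

// linear index: neighboring work-items map to neighboring rows, so accesses of the form
// data[... + linear_index + internal * THREAD_BLOCK_SIZE] are coalesced within a work-group
std::size_t linear_index(const std::size_t block_idx, const std::size_t block_dim, const std::size_t thread_idx) {
    return block_idx * block_dim * INTERNAL_BLOCK_SIZE + thread_idx;
}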
+ + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } - } - }); + }); + } } // apply the remaining part of 
the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -169,64 +211,70 @@ class device_kernel_assembly_symm { } }); - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = cache_one; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } }); - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const 
std::size_t threadIdx_y = idx.get_local_id(group, 1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); } @@ -236,82 +284,83 @@ class device_kernel_assembly_symm { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } } }); - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // rename local memory + auto &B_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 
64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); - // implicit group barrier - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); - - // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, 
[&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); - - // implicit group barrier } } } @@ -322,10 +371,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 34b55fff4..015268fa2 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,21 +30,25 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha 
* A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -53,15 +59,15 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + cache_one_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + cache_two_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -70,7 +76,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -82,50 +88,72 @@ class device_kernel_assembly_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + 
threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { + // rename cached arrays + auto &data_i_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + 
std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -136,16 +164,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper 
triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -155,42 +185,44 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = cache_one_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 
0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all work-items updated C with their values } @@ -199,51 +231,55 @@ // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t 
class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; 
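The detail::atomic_op accumulation used here (and in the other BLAS-like kernels) resolves the race where multiple work-groups add their partial results into the same entry of C. As a rough, hedged sketch of what such a helper can look like with standard SYCL 2020 facilities; the actual wrapper lives in plssvm/backends/SYCL/detail/atomics.hpp and may differ:

#include <sycl/sycl.hpp>  // sycl::atomic_ref

// Hypothetical helper, shown for illustration only: atomically adds `value` to the global-memory
// location `*address`, which several work-groups may update concurrently.
template <typename T>
void atomic_add(T *address, const T value) {
    ::sycl::atomic_ref<T,
                       ::sycl::memory_order::relaxed,
                       ::sycl::memory_scope::device,
                       ::sycl::access::address_space::global_space>
        ref{ *address };
    ref.fetch_add(value);
}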
+ // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all threads updated C with their values } @@ -253,17 +289,17 @@ class device_kernel_assembly_symm { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp index 97c5c6248..6cfa159bc 100644 --- a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp @@ -30,42 +30,17 @@ namespace plssvm::sycl::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index c16965cb1..9e838a89c 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include 
"plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -26,31 +28,36 @@ namespace plssvm::sycl::detail::basic { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -60,77 +67,106 @@ class device_kernel_w_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const 
std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { - // perform the dot product calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + } + } + } + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } - - // update global array with local one - for 
(unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; - } - } } private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -143,46 +179,70 @@ class device_kernel_predict_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the dot product calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } } } } - // update global array with local one - for (unsigned internal_pd = 0; 
internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -192,21 +252,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -215,19 +279,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -235,54 +299,83 @@ class device_kernel_predict { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], - predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = 
pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } } } } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; ++dim) { + // iterate over all classes using blocking + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { if (sv_idx == 0) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + // calculate the index to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += -rho_[class_block + class_idx]; + } } } - // calculate intermediate results and store them in local memory - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // atomically add the results to the prediction + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - detail::atomic_op{ prediction_d_[global_pp_idx * 
(num_classes_ + PADDING_SIZE_uz) + dim] } += - temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += + temp[internal_pp][internal_sv] * alpha_[(class_block + class_idx) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } } } } @@ -290,11 +383,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 4098c4914..ea8bd5b6e 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -26,31 +28,36 @@ namespace plssvm::sycl::detail::hierarchical { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
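// Illustrative sketch, not part of the patch: the idea behind the "if constexpr (target == target_platform::cpu)"
// branches introduced above. Both branches compute the same blocked accumulation; on the CPU the block index is
// the fastest moving (innermost) loop so the compiler sees a contiguous, auto-vectorizable reduction, while
// otherwise it is the slowest moving (outermost) loop, mirroring the ordering of the original GPU kernels.
// The enum, the block size, and the 2x2 accumulator are simplified stand-ins, not the library's types.
#include <cassert>
#include <cstddef>
#include <vector>

enum class platform { cpu, gpu };  // stand-in for plssvm::target_platform

template <platform target>
void accumulate_block(double (&temp)[2][2], const std::vector<double> &a, const std::vector<double> &b,
                      std::size_t block_start, std::size_t block_size, std::size_t stride) {
    if constexpr (target == platform::cpu) {
        // block entries are the fastest moving index -> one contiguous reduction per (i, j) pair
        for (std::size_t i = 0; i < 2; ++i) {
            for (std::size_t j = 0; j < 2; ++j) {
                double sum = 0.0;
                for (std::size_t k = 0; k < block_size; ++k) {
                    sum += a[i * stride + block_start + k] * b[j * stride + block_start + k];
                }
                temp[i][j] += sum;
            }
        }
    } else {
        // block entries are the slowest moving index
        for (std::size_t k = 0; k < block_size; ++k) {
            for (std::size_t i = 0; i < 2; ++i) {
                for (std::size_t j = 0; j < 2; ++j) {
                    temp[i][j] += a[i * stride + block_start + k] * b[j * stride + block_start + k];
                }
            }
        }
    }
}

int main() {
    const std::size_t stride = 16;  // row length of the two small row-major matrices
    std::vector<double> a(2 * stride, 1.0), b(2 * stride, 2.0);
    double cpu_result[2][2]{}, gpu_result[2][2]{};
    for (std::size_t block_start = 0; block_start < stride; block_start += 8) {
        accumulate_block<platform::cpu>(cpu_result, a, b, block_start, 8, stride);
        accumulate_block<platform::gpu>(gpu_result, a, b, block_start, 8, stride);
    }
    assert(cpu_result[0][0] == gpu_result[0][0]);  // both loop orders produce identical results
    return 0;
}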
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -59,36 +66,15 @@ class device_kernel_w_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory feature_idx{ group }; - ::sycl::private_memory feature_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads 
in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -99,23 +85,36 @@ class device_kernel_w_linear { // implicit group barrier // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global 
data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); @@ -123,13 +122,28 @@ class device_kernel_w_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } @@ -138,16 +152,30 @@ class device_kernel_w_linear { // implicit group barrier } - // update global array with local one + // update the global w-vector with the locally cached values group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -155,41 +183,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
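// Illustrative sketch, not part of the patch: the SYCL hierarchical pattern used above in miniature.
// Arrays declared at ::sycl::group scope are placed in local memory and shared by the work-group,
// ::sycl::private_memory keeps per-work-item state alive across parallel_for_work_item scopes, and an
// implicit group barrier separates consecutive parallel_for_work_item calls. Queue setup, sizes, and
// the reduction itself are illustrative assumptions, not the library's kernels.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
    constexpr std::size_t local_size = 8;
    constexpr std::size_t num_groups = 4;
    std::vector<float> result(num_groups, 0.0f);
    {
        sycl::queue q{};
        sycl::buffer<float, 1> buf{ result.data(), sycl::range<1>{ num_groups } };
        q.submit([&](sycl::handler &cgh) {
            auto out = buf.get_access<sycl::access_mode::write>(cgh);
            cgh.parallel_for_work_group(sycl::range<1>{ num_groups }, sycl::range<1>{ local_size }, [=](sycl::group<1> group) {
                float cache[local_size];                       // group scope -> local memory
                sycl::private_memory<float, 1> priv{ group };  // per-work-item state

                group.parallel_for_work_item([&](sycl::h_item<1> idx) {
                    priv(idx) = static_cast<float>(idx.get_local_id(0));
                    cache[idx.get_local_id(0)] = priv(idx);    // each work-item fills one slot
                });
                // implicit group barrier here: cache is fully written before it is read below
                group.parallel_for_work_item([&](sycl::h_item<1> idx) {
                    if (idx.get_local_id(0) == 0) {
                        float sum = 0.0f;
                        for (std::size_t i = 0; i < local_size; ++i) { sum += cache[i]; }
                        out[group[0]] = sum;                   // one work-item writes the group result
                    }
                });
            });
        });
    }  // buffer destruction copies the result back to the host vector
    return 0;
}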
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -201,35 +234,15 @@ class device_kernel_predict_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -240,27 +253,38 @@ class device_kernel_predict_linear { // implicit group barrier - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into shared memory + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes + + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - 
data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); @@ -268,13 +292,28 @@ class device_kernel_predict_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } @@ -283,16 +322,30 @@ class device_kernel_predict_linear { // implicit group barrier } - // update global array with local one + // update the global array with the local one group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent 
potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -300,10 +353,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -313,21 +366,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
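// Illustrative sketch, not part of the patch: the padded 2D indexing convention that appears throughout
// these kernels. A logically num_rows x num_cols matrix is stored row-major with a padded leading
// dimension of (num_cols + PADDING_SIZE); the blocked loads above may read up to a block past the last
// valid column or row, and the padding keeps those accesses inside the allocation. The PADDING_SIZE
// value and buffer sizes below are assumptions for the illustration only.
#include <cstddef>
#include <vector>

constexpr std::size_t PADDING_SIZE = 16;  // assumed; the real value comes from plssvm/constants.hpp

// index into a row-major matrix with padded rows, matching expressions such as
// prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] above
constexpr std::size_t padded_index(std::size_t row, std::size_t col, std::size_t num_cols) {
    return row * (num_cols + PADDING_SIZE) + col;
}

int main() {
    const std::size_t num_predict_points = 10;
    const std::size_t num_classes = 4;
    // allocate with padding in both dimensions, mirroring the padded device buffers
    std::vector<double> prediction((num_predict_points + PADDING_SIZE) * (num_classes + PADDING_SIZE), 0.0);
    prediction[padded_index(3, 2, num_classes)] = 1.5;  // write the prediction for point 3, class 2
    return 0;
}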
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -336,51 +393,34 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory sv_idx_linear{ group }; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -392,27 +432,42 @@ class device_kernel_predict { // implicit group barrier { + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &pp_cache = cache_one; + // auto &sv_cache = cache_two; + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of 
work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; } }); @@ -420,14 +475,30 @@ class device_kernel_predict { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) 
{ - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -439,9 +510,9 @@ class device_kernel_predict { // update temp using the respective kernel function group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); @@ -449,36 +520,42 @@ class device_kernel_predict { // implicit group barrier { - // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception - // auto &alpha_cache = data_cache_pp; - // auto &out_cache = data_cache_sv; + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &alpha_cache = cache_one; + // auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // 
current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -486,15 +563,16 @@ class device_kernel_predict { // implicit group barrier // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + cache_two[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * 
THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * cache_one[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -502,21 +580,29 @@ class device_kernel_predict { // implicit group barrier } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); @@ -527,11 +613,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index 1a42161f5..e26025670 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // 
plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -26,31 +28,36 @@ namespace plssvm::sycl::detail::scoped { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -62,78 +69,101 @@ class device_kernel_w_linear { template void operator()(T group) 
const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // feature_cache + ::sycl::require_local_mem(), // alpha_cache + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &feature_cache, auto &alpha_cache, auto &temp) { // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of 
work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; 
++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } }); } - // update global array with local one + // update the global w-vector with the locally cached values ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -142,41 +172,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. 
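The if constexpr (target == target_platform::cpu) branches introduced above switch between two mathematically identical loop orders for the blocked dot product, presumably so the CPU path exposes a plain innermost reduction to the auto-vectorizer while the default path reuses each cached value across the whole register block. The standalone sketch below shows the two orders side by side; the names and block sizes are hypothetical stand-ins, not code from this patch.

#include <array>

constexpr unsigned THREAD_BLOCK = 8;  // stand-in for THREAD_BLOCK_SIZE
constexpr unsigned INTERNAL = 4;      // stand-in for INTERNAL_BLOCK_SIZE

using cache_t = std::array<std::array<double, INTERNAL>, THREAD_BLOCK>;
using block_t = std::array<std::array<double, INTERNAL>, INTERNAL>;

// CPU-style: the reduction loop is innermost, so each temp entry is a simple sum
// that the auto-vectorizer can turn into one vectorized reduction.
void accumulate_cpu(block_t &temp, const cache_t &a, const cache_t &b) {
    for (unsigned i = 0; i < INTERNAL; ++i) {
        for (unsigned j = 0; j < INTERNAL; ++j) {
            double sum = 0.0;
            for (unsigned k = 0; k < THREAD_BLOCK; ++k) {
                sum += a[k][i] * b[k][j];
            }
            temp[i][j] += sum;
        }
    }
}

// default style: the reduction loop is outermost, so each loaded cache value is
// reused across the whole INTERNAL x INTERNAL register block before moving on.
void accumulate_default(block_t &temp, const cache_t &a, const cache_t &b) {
    for (unsigned k = 0; k < THREAD_BLOCK; ++k) {
        for (unsigned i = 0; i < INTERNAL; ++i) {
            for (unsigned j = 0; j < INTERNAL; ++j) {
                temp[i][j] += a[k][i] * b[k][j];
            }
        }
    }
}

// Both functions produce identical temp blocks; only the memory-access order differs.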
+ constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -191,81 +226,102 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = 
(blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // pp_cache + ::sycl::require_local_mem(), // w_cache - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &pp_cache, auto &w_cache, auto &temp) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // load data into shared memory + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + 
PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } }); } - // update global array with local one + // update the global array with the local one ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group 
x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -274,10 +330,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -287,21 +343,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. 
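The scoped kernels in this file rely on AdaptiveCpp's scoped-parallelism extension (memory_environment, require_local_mem, require_private_mem, distribute_items_and_wait, s_item). The shape-only sketch below reflects my reading of that extension: local memory is requested per work-group, private memory per logical work-item, and the work is split into distribute_items_and_wait phases with implicit barriers in between. The types, sizes, and the 1-D setup are illustrative assumptions, not code from this patch.

#include <sycl/sycl.hpp>

#include <cstddef>

// requires AdaptiveCpp, which provides the scoped-parallelism extension
template <typename Group>
void scoped_pattern(Group group, float *out) {
    ::sycl::memory_environment(group,
        ::sycl::require_local_mem<float[64]>(),  // shared by the whole work-group
        ::sycl::require_private_mem<float>(),    // one instance per logical work-item
        [&](auto &cache, auto &acc) {
            // phase 1: fill the work-group local cache
            ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                cache[idx.get_local_id(group, 0)] = 1.0f;
            });
            // implicit barrier between the two phases
            // phase 2: consume the cache through the per-work-item private accumulator
            ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                acc(idx) = cache[idx.get_local_id(group, 0)];
                out[static_cast<std::size_t>(group[0]) * static_cast<std::size_t>(group.get_logical_local_range(0)) + idx.get_local_id(group, 0)] = acc(idx);
            });
        });
}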
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -310,19 +370,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
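In the kernel body that follows, work-groups along the support-vector dimension accumulate partial sums for the same (predict point, class) output entries; that is why the results are added atomically via detail::atomic_op and why the bias -rho is contributed only by the work-groups with blockIdx_x == 0. The sequential host-side sketch below restates that accounting with illustrative names only; it is not part of the patch.

#include <cstddef>
#include <vector>

// prediction: num_points * num_classes entries, zero-initialised
// partial:    one partial-sum vector per support-vector partition (work-groups along x)
// rho:        one bias per class
void accumulate_partitions(std::vector<double> &prediction,
                           const std::vector<std::vector<double>> &partial,
                           const std::vector<double> &rho,
                           const std::size_t num_points,
                           const std::size_t num_classes) {
    for (std::size_t part = 0; part < partial.size(); ++part) {
        for (std::size_t p = 0; p < num_points; ++p) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                // equivalent of the blockIdx_x == 0 check: only the first partition carries the bias,
                // otherwise -rho would be applied once per partition instead of once per entry
                const double bias = (part == 0) ? -rho[c] : 0.0;
                prediction[p * num_classes + c] += partial[part][p * num_classes + c] + bias;
            }
        }
    }
}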
@@ -332,143 +392,175 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + [&](auto &cache_one, auto &cache_two, auto &temp) { + { + // rename cached arrays + auto &pp_cache = cache_one; + auto &sv_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - } - }); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) 
{ - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } } } - } - }); + }); + } } // update temp using the respective kernel function ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); { // rename cached arrays - auto &alpha_cache = data_cache_pp; - auto &out_cache = data_cache_sv; + auto &alpha_cache = cache_one; + auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto 
THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % 
THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -478,11 +570,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index d451ac7d5..bef23d533 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ #pragma once +#include 
"plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -26,34 +28,39 @@ namespace plssvm::sycl::detail::work_group { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(::sycl::handler &cgh, real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_feature_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_alpha_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(::sycl::handler &cgh, real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + 
feature_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + alpha_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -67,104 +74,130 @@ class device_kernel_w_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { - // load data into local memory - for (unsigned internal = 0; internal < 
INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - data_cache_feature_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the local memory + feature_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < 
INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_feature_; + ::sycl::local_accessor feature_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_alpha_; + ::sycl::local_accessor alpha_cache_; /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + pp_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -181,74 +214,91 @@ class device_kernel_predict_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - 
const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for 
(unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = 
pp_idx + static_cast<std::size_t>(internal_pd);
-                prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx];
+                prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx];  // AoS
             }
         }
     }

   private:
     /// Local memory used for internal memory access optimizations.
-    ::sycl::local_accessor<real_type, 2> data_cache_pp_;
+    ::sycl::local_accessor<real_type, 2> pp_cache_;
     /// Local memory used for internal memory access optimizations.
-    ::sycl::local_accessor<real_type, 2> data_cache_w_;
+    ::sycl::local_accessor<real_type, 2> w_cache_;

     /// @cond Doxygen_suppress
-    real_type *prediction_d_;
-    const real_type *w_d_;
-    const real_type *rho_d_;
-    const real_type *predict_points_d_;
+    real_type *prediction_;
+    const real_type *w_;
+    const real_type *rho_;
+    const real_type *predict_points_;
     const std::size_t num_classes_;
     const std::size_t num_predict_points_;
     const std::size_t num_features_;
@@ -258,22 +308,26 @@ class device_kernel_predict_linear {
 };

 /**
- * @brief Predict the @p predict_points_d using the @p kernel_function.
+ * @brief Predict the @p predict_points using the @p kernel_function.
  * @details Uses SYCL's work-group data parallel kernels.
+ * @tparam target the target platform
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple`
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <target_platform target, kernel_function_type kernel_function, typename... Args>
 class device_kernel_predict {
   public:
+    /// The used SYCL data parallel kernel.
+    constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group;
+
     /**
      * @brief Initialize the SYCL kernel function object.
      * @param[in] cgh the SYCL handler used to allocate the local memory
-     * @param[in] prediction_d the predicted values
-     * @param[in] alpha_d the previously learned weights
-     * @param[in] rho_d the previously learned biases
-     * @param[in] sv_d the support vectors
-     * @param[in] predict_points_d the data points to predict
+     * @param[out] prediction the predicted values
+     * @param[in] alpha the previously learned weights
+     * @param[in] rho the previously learned biases
+     * @param[in] support_vectors the support vectors
+     * @param[in] predict_points the data points to predict
      * @param[in] num_classes the number of classes
      * @param[in] num_sv the number of support vectors
      * @param[in] num_predict_points the number of data points to predict
@@ -282,21 +336,21 @@ class device_kernel_predict {
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
      * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
      */
-    device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args...
kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(::sycl::handler &cgh, real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + cache_one_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + cache_two_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -308,47 +362,63 @@ class device_kernel_predict { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const 
auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { + // rename cached arrays + auto &pp_cache = cache_one_; + auto &sv_cache = cache_two_; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd 
= 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -357,54 +427,57 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { // rename cached arrays - auto &alpha_cache = data_cache_pp_; - auto &out_cache = data_cache_sv_; + auto &alpha_cache = cache_one_; + auto &out_cache = cache_two_; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices 
to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } @@ -413,16 +486,16 @@ class device_kernel_predict { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. 
-    ::sycl::local_accessor<real_type, 2> data_cache_sv_;
+    ::sycl::local_accessor<real_type, 2> cache_two_;
     /// @cond Doxygen_suppress
-    real_type *prediction_d_;
-    const real_type *alpha_d_;
-    const real_type *rho_d_;
-    const real_type *sv_d_;
-    const real_type *predict_points_d_;
+    real_type *prediction_;
+    const real_type *alpha_;
+    const real_type *rho_;
+    const real_type *support_vectors_;
+    const real_type *predict_points_;
     const std::size_t num_classes_;
     const std::size_t num_sv_;
     const std::size_t num_predict_points_;
diff --git a/include/plssvm/backends/gpu_csvm.hpp b/include/plssvm/backends/gpu_csvm.hpp
index 520f665ea..b05d21ab5 100644
--- a/include/plssvm/backends/gpu_csvm.hpp
+++ b/include/plssvm/backends/gpu_csvm.hpp
@@ -143,12 +143,13 @@ class gpu_csvm : virtual public ::plssvm::csvm {
      * @param[in] device_id the device to run the kernel on
      * @param[in] exec the execution range used in the device call
      * @param[in] params the parameters (e.g., kernel function) used to assemble the kernel matrix
+     * @param[in] use_usm_allocations if `true` use USM allocations for the `cg_streaming` implementation
      * @param[in] data_d the data set to create the kernel matrix from
      * @param[in] q_red_d the vector used in the dimensional reduction
      * @param[in] QA_cost the scalar used in the dimensional reduction
      * @return the explicit kernel matrix stored on the device (`[[nodiscard]]`)
      */
-    [[nodiscard]] virtual device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const = 0;
+    [[nodiscard]] virtual device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const = 0;
     /**
      * @brief Perform an explicit BLAS level 3 operation: `C = alpha * A * B + beta * C` where @p A, @p B, and @p C are matrices, and @p alpha and @p beta are scalars.
* @param[in] device_id the device to run the kernel on @@ -231,6 +232,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvmnum_available_devices(); const std::size_t num_rows_reduced = A.shape().x - 1; @@ -296,9 +298,10 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, use_usm_allocations, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -389,6 +392,7 @@ void gpu_csvm::blas_level_3(const solver // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const auto &A_d = detail::move_only_any_cast(A[device_id]); PLSSVM_ASSERT(!A_d.empty(), "The A matrix must not be empty!"); diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index 78729691f..feb404b3d 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -19,8 +19,12 @@ #include "plssvm/matrix.hpp" // plssvm::layout_type, plssvm::matrix #include "plssvm/shape.hpp" // plssvm::shape -#include // std::size_t -#include // std::vector +#include "fmt/format.h" // fmt::format + +#include // std::min +#include // std::size_t +#include // std::swap, std::exchange +#include // std::vector namespace plssvm::detail { @@ -56,23 +60,26 @@ class gpu_device_ptr { * @brief Construct a device_ptr for the device managed by @p queue with the extents { @p size, 1 }. * @param[in] size the size of the managed memory * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ - gpu_device_ptr(size_type size, const queue_type queue); + gpu_device_ptr(size_type size, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape. * @details The managed memory size is: extents[0] * extents[1]. * @param[in] shape the 2D size of the managed memory; size = shape.x * shape.y * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ - gpu_device_ptr(plssvm::shape shape, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape including @p padding. * @details The managed memory size is: (shape.x + padding.x) * (shape.y + padding.y). * @param[in] shape the extents of the managed memory * @param[in] padding the padding applied to the extents * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ - gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue, bool use_usm_allocations); /** * @brief Delete copy-constructor to make device_ptr a move only type. @@ -228,7 +235,7 @@ class gpu_device_ptr { */ void fill(value_type value, size_type pos = 0); /** - * @brief Fill up-to @p count values to @p value starting at position @p pos. 
+ * @brief Fill up-to @p count values of @p value starting at position @p pos. * @details Fill `[pos, rcount)` where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`. * @param[in] value the fill value * @param[in] pos the position to start the fill @@ -368,30 +375,36 @@ class gpu_device_ptr { plssvm::shape padding_{}; /// The device pointer pointing to the managed memory. device_pointer_type data_{}; + /// If true, use USM allocations automatically migrating the data between host and device. + bool use_usm_allocations_{}; }; template -gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ plssvm::shape{ size, 1 } } { } + shape_{ plssvm::shape{ size, 1 } }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ shape } { } + shape_{ shape }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, shape_{ shape }, - padding_{ padding } { } + padding_{ padding }, + use_usm_allocations_{ use_usm_allocations } { } template gpu_device_ptr::gpu_device_ptr(gpu_device_ptr &&other) noexcept : queue_{ std::exchange(other.queue_, queue_type{}) }, shape_{ std::exchange(other.shape_, plssvm::shape{}) }, padding_{ std::exchange(other.padding_, plssvm::shape{}) }, - data_{ std::exchange(other.data_, device_pointer_type{}) } { } + data_{ std::exchange(other.data_, device_pointer_type{}) }, + use_usm_allocations_{ std::exchange(other.use_usm_allocations_, false) } { } template auto gpu_device_ptr::operator=(gpu_device_ptr &&other) noexcept -> gpu_device_ptr & { @@ -401,6 +414,7 @@ auto gpu_device_ptr::opera shape_ = std::exchange(other.shape_, plssvm::shape{}); padding_ = std::exchange(other.padding_, plssvm::shape{}); data_ = std::exchange(other.data_, device_pointer_type{}); + use_usm_allocations_ = std::exchange(other.use_usm_allocations_, false); } return *this; } @@ -411,6 +425,7 @@ void gpu_device_ptr::swap( std::swap(shape_, other.shape_); std::swap(padding_, other.padding_); std::swap(data_, other.data_); + std::swap(use_usm_allocations_, other.use_usm_allocations_); } template diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp index 63e9f9831..a1dc4864a 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp @@ -13,10 +13,11 @@ #define PLSSVM_BACKENDS_STDPAR_KERNEL_CG_EXPLICIT_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/matrix.hpp" // plssvm::soa_matrix -#include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/matrix.hpp" // plssvm::soa_matrix +#include 
"plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array @@ -29,157 +30,218 @@ namespace plssvm::stdpar::detail { /** - * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. - * @param[in] num_rows the number of rows in @p A and @p C - * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for - * @param[in] alpha the scalar alpha value - * @param[in] A the matrix @p A - * @param[in] B the matrix @p B - * @param[in] beta the scalar beta value - * @param[in,out] C the matrix @p C, also used as result matrix + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @tparam target the target platform */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); - - // calculate constants - const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = A.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A_ptr[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_ptr[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; +template +struct device_kernel_symm { + /** + * @brief Perform an explicit BLAS SYMM operation. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + */ + void operator()(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); + PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); + + // calculate constants + const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); + std::iota(range.begin(), range.end(), 0); + + std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = A, B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all values + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr 
(target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A_ptr[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A_ptr[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + temp[internal_i][internal_j] += A_cache * B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } + } } - temp[internal_i][internal_j] += A_val * B_ptr[(dim + row_offset) * (num_rhs + PADDING_SIZE_uz) + global_rhs]; } } - } - - // apply the (partial) BLAS operation and update C - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs] = alpha * temp[internal_i][internal_j] + beta * C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs]; + + // apply the (partial) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = 
i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } } } - } - }); -} + }); + } +}; /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. - * @param[in] num_rows the number of rows in @p A and @p C - * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for - * @param[in] alpha the scalar alpha value - * @param[in] A the matrix @p A - * @param[in] B the matrix @p B - * @param[in] beta the scalar beta value - * @param[in,out] C the matrix @p C, also used as result matrix + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * @tparam target the target platform */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); - - // calculate constants - const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = 
A.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A_ptr[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B_ptr[(dim + row_offset) * (num_rhs + PADDING_SIZE_uz) + global_rhs]; +template +struct device_kernel_symm_mirror { + /** + * @brief Perform an explicit BLAS SYMM operation. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + */ + void operator()(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); + PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); + + // calculate constants + const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); + std::iota(range.begin(), range.end(), 0); + + std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = A, B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over the remaining values + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + sum += A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } + } } } - } - - // apply the (partial) BLAS operation and update C - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs] = alpha * temp[internal_i][internal_j] + beta * 
C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs]; + + // apply the (remaining) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } } } - } - }); -} + }); + } +}; } // namespace plssvm::stdpar::detail diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp index 93772aab3..fdb869351 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -14,14 +14,15 @@ #pragma once #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/matrix.hpp" // plssvm::aos_matrix +#include "plssvm/matrix.hpp" // plssvm::soa_matrix +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array -#include // std::ceil, std::sqrt +#include // std::ceil #include // std::size_t #include // std::execution::par_unseq #include // std::iota @@ -30,88 +31,118 @@ namespace plssvm::stdpar::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function - * @param[out] kernel_matrix the resulting kernel matrix - * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for - * @param[in] q the `q` vector - * @param[in] QA_cost he bottom right matrix entry multiplied by cost - * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @tparam target the target platform + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { - PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); - PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); - - // calculate constants - const std::size_t num_rows = data.num_rows() - 1; - const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated - std::iota(indices.begin(), indices.end(), 0); - - std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), kernel_matrix_ptr = kernel_matrix.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - - // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_row], data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_col]); +template +struct device_kernel_assembly { + /** + * @brief Assemble the kernel matrix using the specified kernel function. 
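The explicit assembly kernel introduced in this hunk writes only the upper-triangular part of the kernel matrix, stored row-wise with PADDING_SIZE padding entries per row; the index expression that appears further down (j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - j_idx * (j_idx + 1) / 2 + i_idx) implements the mapping sketched below. The helper name and signature are hypothetical, the formula itself is taken from the hunk:

#include <cassert>
#include <cstddef>

// packed, padded upper-triangular layout: row j stores the entries for columns j..n-1,
// followed by `pad` padding values, so each row is one entry shorter than the previous one
inline std::size_t packed_upper_index(const std::size_t j, const std::size_t i,
                                      const std::size_t n, const std::size_t pad) {
    assert(j <= i && i < n);  // only the upper-triangular part (i >= j) is stored
    return j * (n + pad) - j * (j + 1) / 2 + i;
}

// example with n = 4, pad = 0: (0,0) -> 0, (1,1) -> 4, (2,2) -> 7, (3,3) -> 9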
+ * @param[out] kernel_matrix the resulting kernel matrix + * @param[in] data the data matrix + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for + * @param[in] q the `q` vector + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] kernel_function_parameter the potential additional arguments for the kernel function + */ + void operator()(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); + PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); + PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); + + // calculate constants + const std::size_t num_rows = data.num_rows() - 1; + const std::size_t num_features = data.num_cols(); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); + std::iota(indices.begin(), indices.end(), 0); + + std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), kernel_matrix_ptr = kernel_matrix](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + + // only calculate the upper triangular matrix + if (i_idx >= j_idx) { + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all features + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target != target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; 
feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } + } } } - } - // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q_ptr[global_row] - q_ptr[global_col]; - // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q_ptr[global_i_idx] - q_ptr[global_j_idx]; + // apply the cost on the diagonal + if (global_i_idx == global_j_idx) { + temp_ij += cost; + } + // update the upper triangular kernel matrix + kernel_matrix_ptr[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } - kernel_matrix_ptr[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; } } } - } - }); -} + }); + } +}; } // namespace plssvm::stdpar::detail diff --git a/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index fdebd9cb5..8aaa10792 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -18,8 +18,8 @@ #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/kernel_functions.hpp" // plssvm::kernel_function -#include "plssvm/matrix.hpp" // aos_matrix +#include "plssvm/matrix.hpp" // plssvm::soa_matrix +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array @@ -32,100 +32,152 @@ namespace plssvm::stdpar::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function - * @param[in] alpha the scalar alpha value - * @param[in] q the `q` vector - * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for - * @param[in] QA_cost he bottom right matrix entry multiplied by cost - * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] B the matrix @p B - * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam target the target platform + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { - PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); - PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); - PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); - PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); - - // calculate constants - const std::size_t num_rows = data.num_rows() - 1; - const std::size_t num_features = data.num_cols(); - const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated - std::iota(indices.begin(), indices.end(), 0); - - std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - - // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_row], data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_col]); +template +struct device_kernel_assembly_symm { + /** + * @brief Perform an implicit BLAS SYMM-like operation. 
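The implicit variant below never materialises the kernel matrix: each recomputed entry a_ij immediately updates row i of C and, by symmetry, row j, which requires atomic accumulation because concurrent work items may write to the same C row. A minimal sketch of that update pattern, using C++20 std::atomic_ref as a stand-in for the backend's own atomic_ref helper (function and layouts hypothetical):

#include <atomic>
#include <cstddef>
#include <vector>

// C and B are row-major [num_rows][num_classes]; a_ij is the recomputed kernel matrix entry
void symmetric_update(std::vector<double> &C, const std::vector<double> &B, const std::size_t num_classes,
                      const std::size_t i, const std::size_t j, const double alpha, const double a_ij) {
    for (std::size_t c = 0; c < num_classes; ++c) {
        // contribution of a_ij to row i of C
        std::atomic_ref{ C[i * num_classes + c] } += alpha * a_ij * B[j * num_classes + c];
        // and, by symmetry, to row j; the diagonal entry must only be applied once
        if (i != j) {
            std::atomic_ref{ C[j * num_classes + c] } += alpha * a_ij * B[i * num_classes + c];
        }
    }
}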
+ * @param[in] alpha the scalar alpha value + * @param[in] q the `q` vector + * @param[in] data the data matrix + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] kernel_function_parameter the potential additional arguments for the kernel function + */ + void operator()(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { + PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); + PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); + PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); + PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); + + // calculate constants + const std::size_t num_rows = data.num_rows() - 1; + const std::size_t num_features = data.num_cols(); + const std::size_t num_classes = B.num_rows(); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); + std::iota(indices.begin(), indices.end(), 0); + + std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + + // only calculate the upper triangular matrix + if (i_idx >= j_idx) { + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction 
calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } + } } } - } - // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q_ptr[global_row] - q_ptr[global_col]; - // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx] } += alpha * temp_ij * B_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx]; + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q_ptr[global_i_idx] - q_ptr[global_j_idx]; + // apply the cost on the diagonal + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx] } += alpha * temp_ij * B_ptr[global_col * (num_classes + PADDING_SIZE_uz) + class_idx]; - // symmetry - atomic_ref{ C_ptr[global_col * (num_classes + PADDING_SIZE_uz) + class_idx] } += alpha * temp_ij * B_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx]; + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx] } += alpha * temp[internal_i][internal_j] * B_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx]; + } + } else { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx] } += alpha * 
temp[internal_i][internal_j] * B_ptr[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx]; + // symmetry + atomic_ref{ C_ptr[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx] } += alpha * temp[internal_i][internal_j] * B_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx]; + } } } } } } - } - }); -} + }); + } +}; } // namespace plssvm::stdpar::detail diff --git a/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp b/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp index e652d1160..b77e7a338 100644 --- a/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp @@ -38,42 +38,17 @@ namespace plssvm::stdpar::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline PLSSVM_STDPAR_KERNEL_FUNCTION real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp b/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp index ce46e6a1c..4b487dce2 100644 --- a/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp @@ -15,15 +15,16 @@ #include "plssvm/backends/stdpar/detail/utility.hpp" // plssvm::stdpar::detail::atomic_ref #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix #include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array -#include // std::fma +#include // std::ceil #include // std::size_t #include // std::execution::par_unseq #include // std::iota @@ -33,230 +34,305 @@ namespace plssvm::stdpar::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
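For context on what device_kernel_w_linear computes: with the linear kernel, prediction can be sped up by precomputing one weight vector per class, w_c = sum over all support vectors of alpha[c][sv] * x_sv, so that predicting a point reduces to a single dot product per class instead of a sum over all support vectors. A plain, unblocked scalar sketch of that accumulation (hypothetical function, simple row-major layouts, no SoA storage or padding):

#include <cstddef>
#include <vector>

// alpha is [num_classes][num_sv], support_vectors is [num_sv][num_features],
// the returned w is [num_classes][num_features]; all matrices are plain row-major here
std::vector<double> compute_w_linear(const std::vector<double> &alpha, const std::vector<double> &support_vectors,
                                     const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_features) {
    std::vector<double> w(num_classes * num_features, 0.0);
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t sv = 0; sv < num_sv; ++sv) {
            for (std::size_t f = 0; f < num_features; ++f) {
                // w_c += alpha_{c,sv} * x_sv
                w[c * num_features + f] += alpha[c * num_sv + sv] * support_vectors[sv * num_features + f];
            }
        }
    }
    return w;
}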
- * @param[out] w the vector to speedup the linear prediction - * @param[in] alpha the previously learned weights - * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @tparam target the target platform */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { - PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); - PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); - - // calculate constants - const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = support_vectors.num_cols(); - const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_support_vectors = support_vectors.num_rows(); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_num_classes * blocked_num_features); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, w_ptr = w.data(), alpha_ptr = alpha.data(), sv_ptr = support_vectors.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_feature][internal_class] += alpha_ptr[global_class_idx * (num_support_vectors + PADDING_SIZE_uz) + sv_offset + sv] * sv_ptr[global_feature_idx * (num_support_vectors + PADDING_SIZE_uz) + sv_offset + sv]; +template +struct device_kernel_w_linear { + /** + * @brief Calculate the `w` vector used to speedup the prediction using the 
linear kernel function.
+     * @param[out] w the vector to speed up the linear prediction
+     * @param[in] alpha the previously learned weights
+     * @param[in] support_vectors the support vectors
+     * @param[in] device_num_sv the number of support vectors the current device is responsible for
+     * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for
+     */
+    void operator()(soa_matrix<real_type> &w, const aos_matrix<real_type> &alpha, const soa_matrix<real_type> &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) {
+        PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows());
+        PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }));
+        PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place-specific sv ({}) cannot be greater than the total number of sv ({})!", device_num_sv, support_vectors.num_rows());
+        PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater than the total number of sv ({})!", device_sv_offset, support_vectors.num_rows());
+
+        // calculate constants
+        const std::size_t num_classes = alpha.num_rows();
+        const std::size_t num_features = support_vectors.num_cols();
+        const std::size_t num_sv = support_vectors.num_rows();
+        const auto blocked_num_classes = static_cast<std::size_t>(std::ceil(static_cast<double>(num_classes) / INTERNAL_BLOCK_SIZE));
+        const auto blocked_num_features = static_cast<std::size_t>(std::ceil(static_cast<double>(num_features) / INTERNAL_BLOCK_SIZE));
+
+        // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
+        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
+
+        // define the range over which should be iterated
+        std::vector<std::size_t> range(blocked_num_classes * blocked_num_features);
+        std::iota(range.begin(), range.end(), 0);
+
+        std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, w_ptr = w.data(), alpha_ptr = alpha.data(), support_vectors_ptr = support_vectors.data()](const std::size_t idx) {
+            // calculate the indices used in the current thread
+            const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+            const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+
+            // create a thread private array used for internal caching
+            std::array<std::array<real_type, INTERNAL_BLOCK_SIZE>, INTERNAL_BLOCK_SIZE> temp{};
+
+            // iterate over all support vectors using blocking
+            for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) {
+                if constexpr (target == target_platform::cpu) {
+                    // perform the dot product calculation, the sv is the fastest moving index
+                    for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) {
+                        for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                            // calculate the indices to access the global data
+                            const auto global_feature_idx = feature_idx + static_cast<std::size_t>(internal_feature);
+                            const auto global_class_idx = class_idx + static_cast<std::size_t>(internal_class);
+
+                            real_type sum{ 0.0 };
+                            for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) {
+                                sum += alpha_ptr[global_class_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv] * // 
AoS + support_vectors_ptr[global_feature_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_feature][internal_class] += alpha_ptr[global_class_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv] * // AoS + support_vectors_ptr[global_feature_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv]; // SoA + } + } + } } } - } - // update global array with local one - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_ptr[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_ptr[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + } } - } - }); -} + }); + } +}; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction the predicted values - * @param[in] w the vector to speedup the calculations - * @param[in] rho the previously learned bias - * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
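For reference, the blocked functor above computes, per device, w(class, feature) as the sum over the device-local support vectors of alpha(class, sv) * support_vector(sv, feature); the CPU branch keeps the support-vector index innermost so the loads stay contiguous for the auto-vectorizer, while the non-CPU branch makes it the slowest moving index. The following minimal, unblocked sketch performs the same reduction on plain buffers; the function name and the buffer layouts are illustrative assumptions, not the PLSSVM API:

#include <cstddef>
#include <vector>

// Unblocked reference: w(c, d) = sum over all support vectors sv of alpha(c, sv) * sv_data(sv, d).
// alpha is laid out class-major and sv_data feature-major (both with the support vector as the
// fastest moving index), loosely mirroring the AoS/SoA layouts used by the kernel above,
// but without padding, blocking, or parallelization.
std::vector<double> reference_w(const std::vector<double> &alpha,    // num_classes * num_sv
                                const std::vector<double> &sv_data,  // num_features * num_sv
                                const std::size_t num_classes,
                                const std::size_t num_features,
                                const std::size_t num_sv) {
    std::vector<double> w(num_classes * num_features, 0.0);
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t d = 0; d < num_features; ++d) {
            double sum{ 0.0 };
            for (std::size_t sv = 0; sv < num_sv; ++sv) {
                sum += alpha[c * num_sv + sv] * sv_data[d * num_sv + sv];
            }
            w[c * num_features + d] = sum;
        }
    }
    return w;
}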
+ * @tparam target the target platform */ -inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { - PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); - PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); - PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); - - // calculate constants - const std::size_t num_predict_points = predict_points.num_rows(); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), w_ptr = w.data(), rho_ptr = rho.data(), pp_ptr = predict_points.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pp][internal_class] += w_ptr[dim * (num_classes + PADDING_SIZE_uz) + global_class_idx] * pp_ptr[dim * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]; +template +struct device_kernel_predict_linear { + /** + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
+     * @param[out] prediction the predicted values
+     * @param[in] w the vector to speed up the calculations
+     * @param[in] rho the previously learned bias
+     * @param[in] predict_points the data points to predict
+     * @param[in] device_num_predict_points the number of predict points the current device is responsible for
+     * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for
+     */
+    void operator()(aos_matrix<real_type> &prediction, const soa_matrix<real_type> &w, const std::vector<real_type> &rho, const soa_matrix<real_type> &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) {
+        PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size());
+        PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols());
+        PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() }));
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place-specific predict points ({}) cannot be greater than the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows());
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater than the total number of predict points ({})!", device_row_offset, predict_points.num_rows());
+
+        // calculate constants
+        const std::size_t num_predict_points = predict_points.num_rows();
+        const std::size_t num_classes = prediction.num_cols();
+        const std::size_t num_features = predict_points.num_cols();
+        const auto blocked_device_num_predict_points = static_cast<std::size_t>(std::ceil(static_cast<double>(device_num_predict_points) / INTERNAL_BLOCK_SIZE));
+        const auto blocked_num_classes = static_cast<std::size_t>(std::ceil(static_cast<double>(num_classes) / INTERNAL_BLOCK_SIZE));
+
+        // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
+        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
+
+        // define the range over which should be iterated
+        std::vector<std::size_t> range(blocked_device_num_predict_points * blocked_num_classes);
+        std::iota(range.begin(), range.end(), 0);
+
+        std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), w_ptr = w.data(), rho_ptr = rho.data(), predict_points_ptr = predict_points.data()](const std::size_t idx) {
+            // calculate the indices used in the current thread
+            const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+            const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+
+            // create a thread private array used for internal caching
+            std::array<std::array<real_type, INTERNAL_BLOCK_SIZE>, INTERNAL_BLOCK_SIZE> temp{};
+
+            // iterate over all features using blocking
+            for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) {
+                if constexpr (target == target_platform::cpu) {
+                    // perform the dot product calculation, the feature is the fastest moving index
+                    for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) {
+                        for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                            // calculate the indices to access the global data
+                            const auto global_pp_idx = 
device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_ptr[(feature_block + feature) * (num_classes + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_ptr[(feature_block + feature) * (num_classes + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } + } } } - } - // perform the dot product calculation - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_ptr[global_class_idx]; } } - } - }); -} + }); + } +}; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. - * @tparam kernel the type of the used kernel function + * @brief Predict the @p predict_points using the @p kernel_function. 
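The functor above is the second half of the linear-kernel shortcut: once w is available, each prediction reduces to prediction(point, class) = sum over all features d of predict_point(point, d) * w(class, d), minus the bias rho(class). A matching unblocked reference follows, again with illustrative buffer layouts rather than the actual padded SoA/AoS matrices:

#include <cstddef>
#include <vector>

// Unblocked reference for the linear prediction with a precomputed w:
// prediction(i, c) = sum over all features d of points(i, d) * w(c, d) - rho(c).
std::vector<double> reference_predict_linear(const std::vector<double> &w,       // num_classes * num_features
                                             const std::vector<double> &rho,     // num_classes
                                             const std::vector<double> &points,  // num_points * num_features
                                             const std::size_t num_classes,
                                             const std::size_t num_features,
                                             const std::size_t num_points) {
    std::vector<double> prediction(num_points * num_classes, 0.0);
    for (std::size_t i = 0; i < num_points; ++i) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            double sum{ 0.0 };
            for (std::size_t d = 0; d < num_features; ++d) {
                sum += points[i * num_features + d] * w[c * num_features + d];
            }
            prediction[i * num_classes + c] = sum - rho[c];
        }
    }
    return prediction;
}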
+ * @tparam target the target platform + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] prediction the predicted values - * @param[in] alpha the previously learned weights - * @param[in] rho the previously learned bias - * @param[in] support_vectors the support vectors - * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for - * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { - PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); - PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); - PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); - - // calculate constants - const std::size_t num_classes = alpha.num_rows(); - const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_predict_points = predict_points.num_rows(); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), alpha_ptr = alpha.data(), rho_ptr = rho.data(), sv_ptr = support_vectors.data(), pp_ptr = predict_points.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp 
* INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pp][internal_sv] += detail::feature_reduce(sv_ptr[dim * (num_support_vectors + PADDING_SIZE_uz) + global_sv_idx], - pp_ptr[dim * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]); +template +struct device_kernel_predict { + /** + * @brief Predict the @p predict_points using the kernel function. + * @param[out] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned bias + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for + * @param[in] kernel_function_parameter the parameters necessary to apply the kernel function + */ + void operator()(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) {
+        PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size());
+        PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows());
+        PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols());
+        PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }));
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place-specific predict points ({}) cannot be greater than the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows());
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater than the total number of predict points ({})!", device_row_offset, predict_points.num_rows());
+
+        // calculate constants
+        const std::size_t num_classes = alpha.num_rows();
+        const std::size_t device_num_sv = support_vectors.num_rows();
+        const std::size_t num_features = predict_points.num_cols();
+        const std::size_t num_predict_points = predict_points.num_rows();
+        const auto blocked_device_num_sv = static_cast<std::size_t>(std::ceil(static_cast<double>(device_num_sv) / INTERNAL_BLOCK_SIZE));
+        const auto blocked_device_num_predict_points = static_cast<std::size_t>(std::ceil(static_cast<double>(device_num_predict_points) / INTERNAL_BLOCK_SIZE));
+
+        // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
+        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
+
+        // define the range over which should be iterated
+        std::vector<std::size_t> range(blocked_device_num_predict_points * blocked_device_num_sv);
+        std::iota(range.begin(), range.end(), 0);
+
+        std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), alpha_ptr = alpha.data(), rho_ptr = rho.data(), support_vectors_ptr = support_vectors.data(), predict_points_ptr = predict_points.data()](const std::size_t idx) {
+            // calculate the indices used in the current thread
+            const std::size_t pp_idx = (idx / blocked_device_num_sv) * INTERNAL_BLOCK_SIZE_uz;  // num_predict_points
+            const std::size_t sv_idx = (idx % blocked_device_num_sv) * INTERNAL_BLOCK_SIZE_uz;  // num_support_vectors
+
+            // create a thread private array used for internal caching
+            std::array<std::array<real_type, INTERNAL_BLOCK_SIZE>, INTERNAL_BLOCK_SIZE> temp{};
+
+            // iterate over all features
+            for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) {
+                if constexpr (target == target_platform::cpu) {
+                    // perform the feature reduction calculation, the feature is the fastest moving index
+                    for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) {
+                        for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                            // calculate the indices to access the global data
+                            const auto global_pp_idx = device_row_offset + pp_idx + static_cast<std::size_t>(internal_pp);
+                            const auto global_sv_idx = sv_idx + static_cast<std::size_t>(internal_sv);
+
+                            real_type sum{ 0.0 };
+                            for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) {
+                                sum += 
detail::feature_reduce(support_vectors_ptr[(feature_block + feature) * (device_num_sv + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_ptr[(feature_block + feature) * (device_num_sv + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } + } } } - } - // update temp using the respective kernel function - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + // update temp using the respective kernel function + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + } } - } - // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + // atomically add the results to the prediction for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + a] } += -rho_ptr[a]; + atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_idx] } += -rho_ptr[class_idx]; } - atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + a] } += - temp[internal_pp][internal_sv] * alpha_ptr[a * (num_support_vectors + PADDING_SIZE_uz) + global_sv_idx]; + atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_idx] } += + temp[internal_pp][internal_sv] * alpha_ptr[class_idx * (device_num_sv + PADDING_SIZE_uz) + global_sv_idx]; } } } - } - }); -} + }); + } +}; } // 
namespace plssvm::stdpar::detail diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index e99dbeddd..81d992991 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -38,11 +38,8 @@ constexpr unsigned INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; constexpr unsigned INTERNAL_BLOCK_SIZE = 4; #endif -/// Global compile time constant used for internal feature caching. -constexpr unsigned FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE; - -/// Padding used for the device w_d matrix to prevent out-of-bounce accesses without ifs. -constexpr unsigned PADDING_SIZE = FEATURE_BLOCK_SIZE > (THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) ? FEATURE_BLOCK_SIZE : (THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE); +/// Padding used for the device arrays and matrices to prevent out-of-bounce accesses without ifs. +constexpr unsigned PADDING_SIZE = THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE; // perform sanity checks static_assert(detail::tuple_contains_v, "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index 6ec7773c4..726165679 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -13,35 +13,35 @@ #define PLSSVM_CORE_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // all supported backend types -#include "plssvm/backends/SYCL/implementation_types.hpp" // the SYCL implementation type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // the SYCL specific kernel invocation typ -#include "plssvm/classification_report.hpp" // reports different metrics (precision, recall, f1 score, and support) for the different classes after scoring -#include "plssvm/classification_types.hpp" // all supported multi-class classification strategies -#include "plssvm/constants.hpp" // verbosity flag und compile-time constants -#include "plssvm/csvm_factory.hpp" // a factory function to instantiate a C-SVM using a runtime backend; includes the available backend C-SVMs -#include "plssvm/data_set/classification_data_set.hpp" // a classification data set used for training a C-SVC -#include "plssvm/data_set/min_max_scaler.hpp" // a min-max scaler for the data sets -#include "plssvm/data_set/regression_data_set.hpp" // a regression data set used for training a C-SVR -#include "plssvm/environment.hpp" // environment management functions and classes -#include "plssvm/exceptions/exceptions.hpp" // exception hierarchy -#include "plssvm/file_format_types.hpp" // all supported file format types -#include "plssvm/gamma.hpp" // the types of the gamma parameter -#include "plssvm/kernel_function_types.hpp" // all supported kernel function types -#include "plssvm/kernel_functions.hpp" // implementation of all supported kernel functions -#include "plssvm/matrix.hpp" // a custom matrix class -#include "plssvm/model/classification_model.hpp" // the model as a result of training a C-SVC -#include "plssvm/model/regression_model.hpp" // the model as a result of training a C-SVR -#include "plssvm/mpi/communicator.hpp" // PLSSVM MPI communicator wrapper -#include "plssvm/parameter.hpp" // the C-SVM parameter -#include "plssvm/regression_report.hpp" // reports different metrics (e.g., mean squared error or R^2 score) for the regression task after scoring -#include "plssvm/shape.hpp" // shape for a matrix or device pointer -#include "plssvm/solver_types.hpp" // all supported solver types (e.g., Conjugate Gradients with explicit, streaming, or implicit kernel matrix 
generation) -#include "plssvm/svm/csvc.hpp" // the base C-SVC every backend is inheriting from -#include "plssvm/svm/csvr.hpp" // the base C-SVR every backend is inheriting from -#include "plssvm/target_platforms.hpp" // all supported target platforms -#include "plssvm/verbosity_levels.hpp" // all supported verbosity levels -#include "plssvm/version/version.hpp" // version information +#include "plssvm/backend_types.hpp" // all supported backend types +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // the SYCL specific data parallel kernels +#include "plssvm/backends/SYCL/implementation_types.hpp" // the SYCL implementation type +#include "plssvm/classification_report.hpp" // reports different metrics (precision, recall, f1 score, and support) for the different classes after scoring +#include "plssvm/classification_types.hpp" // all supported multi-class classification strategies +#include "plssvm/constants.hpp" // verbosity flag und compile-time constants +#include "plssvm/csvm_factory.hpp" // a factory function to instantiate a C-SVM using a runtime backend; includes the available backend C-SVMs +#include "plssvm/data_set/classification_data_set.hpp" // a classification data set used for training a C-SVC +#include "plssvm/data_set/min_max_scaler.hpp" // a min-max scaler for the data sets +#include "plssvm/data_set/regression_data_set.hpp" // a regression data set used for training a C-SVR +#include "plssvm/environment.hpp" // environment management functions and classes +#include "plssvm/exceptions/exceptions.hpp" // exception hierarchy +#include "plssvm/file_format_types.hpp" // all supported file format types +#include "plssvm/gamma.hpp" // the types of the gamma parameter +#include "plssvm/kernel_function_types.hpp" // all supported kernel function types +#include "plssvm/kernel_functions.hpp" // implementation of all supported kernel functions +#include "plssvm/matrix.hpp" // a custom matrix class +#include "plssvm/model/classification_model.hpp" // the model as a result of training a C-SVC +#include "plssvm/model/regression_model.hpp" // the model as a result of training a C-SVR +#include "plssvm/mpi/communicator.hpp" // PLSSVM MPI communicator wrapper +#include "plssvm/parameter.hpp" // the C-SVM parameter +#include "plssvm/regression_report.hpp" // reports different metrics (e.g., mean squared error or R^2 score) for the regression task after scoring +#include "plssvm/shape.hpp" // shape for a matrix or device pointer +#include "plssvm/solver_types.hpp" // all supported solver types (e.g., Conjugate Gradients with explicit, streaming, or implicit kernel matrix generation) +#include "plssvm/svm/csvc.hpp" // the base C-SVC every backend is inheriting from +#include "plssvm/svm/csvr.hpp" // the base C-SVR every backend is inheriting from +#include "plssvm/target_platforms.hpp" // all supported target platforms +#include "plssvm/verbosity_levels.hpp" // all supported verbosity levels +#include "plssvm/version/version.hpp" // version information /// The main namespace containing all public API functions. 
namespace plssvm { } diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp new file mode 100644 index 000000000..e69de29bb diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp index 073e92f6c..8bfa7ef3e 100644 --- a/include/plssvm/detail/cmd/parser_predict.hpp +++ b/include/plssvm/detail/cmd/parser_predict.hpp @@ -13,12 +13,12 @@ #define PLSSVM_DETAIL_CMD_PARSER_PREDICT_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // fmt::ostream_formatter @@ -47,8 +47,8 @@ struct parser_predict { /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel. target_platform target{ target_platform::automatic }; - /// The kernel invocation type when using SYCL as backend. - sycl::kernel_invocation_type sycl_kernel_invocation_type{ sycl::kernel_invocation_type::automatic }; + /// The data parallel kernel when using SYCL as backend. + sycl::data_parallel_kernel sycl_data_parallel_kernel{ sycl::data_parallel_kernel::automatic }; /// The SYCL implementation to use with `--backend sycl`. 
sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; diff --git a/include/plssvm/detail/cmd/parser_train.hpp b/include/plssvm/detail/cmd/parser_train.hpp index 6ddae10ac..6253394a6 100644 --- a/include/plssvm/detail/cmd/parser_train.hpp +++ b/include/plssvm/detail/cmd/parser_train.hpp @@ -13,17 +13,17 @@ #define PLSSVM_DETAIL_CMD_PARSER_TRAIN_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/classification_types.hpp" // plssvm::classification_type -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/solver_types.hpp" // plssvm::solving_type -#include "plssvm/svm_types.hpp" // plssvm::svm_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/classification_types.hpp" // plssvm::classification_type +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/solver_types.hpp" // plssvm::solving_type +#include "plssvm/svm_types.hpp" // plssvm::svm_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // mt::ostream_formatter @@ -67,8 +67,8 @@ struct parser_train { /// The used solver type for the LS-SVM kernel matrix: automatic (depending on the available (V)RAM), cg_explicit, or cg_implicit. solver_type solver{ solver_type::automatic }; - /// The kernel invocation type when using SYCL as backend. - sycl::kernel_invocation_type sycl_kernel_invocation_type{ sycl::kernel_invocation_type::automatic }; + /// The data parallel kernel when using SYCL as backend. + sycl::data_parallel_kernel sycl_data_parallel_kernel{ sycl::data_parallel_kernel::automatic }; /// The SYCL implementation to use with --backend=sycl. sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; diff --git a/include/plssvm/detail/cmd/utility.hpp b/include/plssvm/detail/cmd/utility.hpp new file mode 100644 index 000000000..9cc689868 --- /dev/null +++ b/include/plssvm/detail/cmd/utility.hpp @@ -0,0 +1,94 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Utility functions related to the command line parser functionality. 
+ */ + +#ifndef PLSSVM_DETAIL_CMD_UTILITY_HPP_ +#define PLSSVM_DETAIL_CMD_UTILITY_HPP_ +#pragma once + +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernels +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "cxxopts.hpp" // cxxopts::ParseResult, cxxopts::Options + +#include // std::size_t +#include // std::optional +#include // std::string +#include // std::pair +#include // std::vector + +namespace plssvm::detail::cmd { + +/** + * @brief Filter the provided command line options starting with the @p prefix_filter. + * @details Currently, per default filters out all options starting with "--hpx:" and "--kokkos-". + * @attention **ONLY** single command line options are supported! I.e., "--hpx:threads=42" is supported, but not "--hpx:threads 42". + * @param[in] argc the number of provided command line options to be filtered + * @param[in] argv the command line options to be filtered + * @param[in] prefix_filter a list of prefixes that should be filtered + * @return a `std::vector` containing all non-filtered command line options (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector filter_argv(int argc, char **argv, const std::vector &prefix_filter = { "--hpx:", "--kokkos-" }); + +/** + * @brief Assemble a more detailed help message for the kernel function types also containing their mathematical formula. + * @return the kernel functions' help message (`[[nodiscard]]`) + */ +[[nodiscard]] std::string kernel_type_help_message(); + +/** + * @brief If a SYCL backend is available, parse the SYCL specific command line options "--sycl_data_parallel_kernel" and "--sycl_implementation_type". + * @details If a SYCL backend is available, returns the two parsed command line options wrapped in a `std::pair`, otherwise returns a `std::nullopt`. + * @param[in] result the cxxopts parser result encapsulating the command line options + * @param[in] comm the MPI communicator + * @param[in] backend the requested backend + * @param[in] target the requested target platform + * @return the parsed, SYCL specific command line options (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional> parse_and_check_sycl_options_if_available(const cxxopts::ParseResult &result, const mpi::communicator &comm, backend_type backend, target_platform target); + +/** + * @brief If the Kokkos backend is available, parse the Kokkos specific command line option "--kokkos_execution_space". + * @details If the Kokkos backend is available, returns the parsed command line option, otherwise returns a `std::nullopt`. 
+ * @param[in] result the cxxopts parser result encapsulating the command line option + * @param[in] comm the MPI communicator + * @param[in] backend the requested backend + * @param[in] target the requested target platform + * @return the parsed, Kokkos specific command line option (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional parse_and_check_kokkos_options_if_available(const cxxopts::ParseResult &result, const mpi::communicator &comm, backend_type backend, target_platform target); + +/** + * @brief If MPI is available, parse the MPI specific command line option "--mpi_load_balancing_weights". + * @details If MPI is available, returns the parsed command line option, otherwise returns a `std::nullopt`. + * @param[in] result the cxxopts parser result encapsulating the command line option + * @param[in] options all supported command line options + * @param[in] comm the MPI communicator + * @return the parsed, MPI specific command line option (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional> parse_and_check_mpi_options_if_available(const cxxopts::ParseResult &result, const cxxopts::Options &options, const mpi::communicator &comm); + +/** + * @brief Parse the verbosity command line option. + * @details If it was provided, returns the parsed value, otherwise returns a `std::nullopt`. + * @param[in] result the cxxopts parser result encapsulating the command line option + * @param[in] comm the MPI communicator + * @return the parsed verbosity command line option (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional parse_verbosity(const cxxopts::ParseResult &result, const mpi::communicator &comm); + +} // namespace plssvm::detail::cmd + +#endif // PLSSVM_DETAIL_CMD_UTILITY_HPP_ diff --git a/include/plssvm/detail/data_distribution.hpp b/include/plssvm/detail/data_distribution.hpp index af4043a79..fd433bcd0 100644 --- a/include/plssvm/detail/data_distribution.hpp +++ b/include/plssvm/detail/data_distribution.hpp @@ -23,6 +23,7 @@ #include // std::size_t #include // std::ostream forward declaration #include // std::accumulate +#include // std::pair #include // std::vector namespace plssvm::detail { @@ -234,6 +235,22 @@ class triangular_data_distribution : public data_distribution { */ [[nodiscard]] std::vector calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** + * @brief Calculate the theoretical total memory needed per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical total memory needed per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::pair> calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(std::size_t num_features, std::size_t num_classes) const; + + /** + * @brief Calculate the theoretical maximum single memory allocation size per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical maximum single memory allocation size per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::vector calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** * @brief Calculate the theoretical total memory needed per place for implicitly assembling the kernel matrix. 
* @param[in] num_features the total number of features diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp new file mode 100644 index 000000000..ca58eec3a --- /dev/null +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -0,0 +1,125 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + */ + +#ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ + +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT + +#include // std::size_t +#include // std::memset +#include // std::unique_ptr +#include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v + +namespace plssvm::detail { + +/** + * @brief Helper struct to check whether @p T is an unbounded array. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref plssvm::detail::is_unbounded_array for unbounded arrays. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref plssvm::detail::is_unbounded_array. + * @tparam T the array type + */ +template +constexpr bool is_unbounded_array_v = is_unbounded_array::value; + +/** + * @brief Helper struct to check whether @p T is a bounded array. + * @tparam T the array type + */ +template +struct is_bounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref plssvm::detail::is_bounded_array for unbounded arrays. + * @tparam T the array type + * @tparam N the size of the array + */ +template +struct is_bounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref plssvm::detail::is_bounded_array. + * @tparam T the array type + */ +template +constexpr bool is_bounded_array_v = is_bounded_array::value; + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +[[nodiscard]] std::unique_ptr make_unique_for_overwrite() { + return std::unique_ptr(new T); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the objects to create + * @param[in] n the size of the array to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +std::unique_ptr make_unique_for_overwrite(const std::size_t n) { + return std::unique_ptr(new std::remove_extent_t[n]); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. 
+ * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @tparam Args the types of the constructor arguments + * @param[in] args the arguments to pass to the constructor + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +auto make_unique_for_overwrite(Args &&...args) = delete; + +/** + * @brief Fill the array @p dest with zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset. + * @tparam T the type of the values + * @param[in,out] dest the array to fill with zeros + * @param[in] count the number of values to fill + */ +template +void parallel_zero_memset(T *dest, const std::size_t count) { + PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); + +// initialize the data pointed to by dest to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < count; ++i) { + dest[i] = T{ 0 }; + } +#else + std::memset(dest, 0, count * sizeof(T)); +#endif +} + +} // namespace plssvm::detail + +#endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/include/plssvm/detail/type_traits.hpp b/include/plssvm/detail/type_traits.hpp index 288f8b80b..e29452955 100644 --- a/include/plssvm/detail/type_traits.hpp +++ b/include/plssvm/detail/type_traits.hpp @@ -24,6 +24,7 @@ #include // std::enable_if_t, std::remove_cv_t, std::remove_reference_t, std::is_same_v, std::false_type, std::true_type, std::is_same_v #include // std::unordered_map, std::unordered_multimap #include // std::unordered_set, std::unordered_multiset +#include // std::variant #include // std::vector namespace plssvm::detail { @@ -357,6 +358,25 @@ struct is_one_type_of { template constexpr bool is_one_type_of_v = is_one_type_of::value; +/** + * @brief Type trait to check whether @p T is a `std::variant`. + * @tparam T the type to check + */ +template +struct is_variant : std::false_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +struct is_variant> : std::true_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +constexpr bool is_variant_v = is_variant::value; + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/detail/utility.hpp b/include/plssvm/detail/utility.hpp index e81d46ae1..613a571cc 100644 --- a/include/plssvm/detail/utility.hpp +++ b/include/plssvm/detail/utility.hpp @@ -50,6 +50,21 @@ namespace plssvm::detail { +/** + * @brief Shorthand for a more readable `std::visit` overload set. + * @tparam Ts the visited types + */ +template +struct visit_overload : Ts... { + using Ts::operator()...; +}; + +/** + * @brief plssvm::detail::visit_overload + */ +template +visit_overload(Ts...) -> visit_overload; + /** * @brief Invokes undefined behavior. Used to mark code paths that may never be reachable. 
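A typical use of the visit_overload helper introduced above is to dispatch on a std::variant with one lambda per alternative. The sketch below repeats the helper locally so it compiles on its own and uses an illustrative variant type, not a PLSSVM type:

#include <iostream>
#include <string>
#include <variant>

// Combine several lambdas into a single overload set and dispatch on the active
// alternative of a std::variant via std::visit (standard C++17 idiom).
template <typename... Ts>
struct visit_overload : Ts... {
    using Ts::operator()...;
};

template <typename... Ts>
visit_overload(Ts...) -> visit_overload<Ts...>;

int main() {
    const std::variant<int, double, std::string> value{ 3.14 };

    std::visit(visit_overload{
                   [](const int i) { std::cout << "int: " << i << '\n'; },
                   [](const double d) { std::cout << "double: " << d << '\n'; },
                   [](const std::string &s) { std::cout << "string: " << s << '\n'; } },
               value);
    return 0;
}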
* @details See: C++23 [`std::unreachable`](https://en.cppreference.com/w/cpp/utility/unreachable) diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp index 3dec0a5c6..b2ac0bb35 100644 --- a/include/plssvm/environment.hpp +++ b/include/plssvm/environment.hpp @@ -18,13 +18,15 @@ #include "plssvm/backend_types.hpp" // plssvm::backend_type, plssvm::list_available_backends #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/cmd/utility.hpp" // plssvm::detail::cmd::filter_argv #include "plssvm/detail/string_utility.hpp" // plssvm::detail::to_lower_case #include "plssvm/detail/utility.hpp" // plssvm::detail::{contains, unreachable} #include "plssvm/exceptions/exceptions.hpp" // plssvm::environment_exception #include "plssvm/mpi/environment.hpp" // plssvm::mpi::{is_initialized, init} -#if defined(PLSSVM_HAS_HPX_BACKEND) +#if defined(PLSSVM_HAS_HPX_BACKEND) || defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HPX) #include "hpx/execution.hpp" // ::hpx::post + #include "hpx/hpx_main.hpp" // disable support for HPX's short command line aliases #include "hpx/hpx_start.hpp" // ::hpx::{start, stop, finalize} #include "hpx/runtime.hpp" // ::hpx::{is_running, is_stopped} #endif @@ -239,7 +241,13 @@ inline void initialize_backend([[maybe_unused]] const backend_type backend, [[ma #endif #if defined(PLSSVM_HAS_KOKKOS_BACKEND) if (backend == backend_type::kokkos) { - Kokkos::initialize(argc, argv); + #if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HPX) + ::hpx::start(nullptr, argc, argv); + #endif + // we have to filter out our "--kokkos_execution_space" command line option or Kokkos itself will issue a warning on the command line + std::vector filtered_argv = plssvm::detail::cmd::filter_argv(argc, argv, { "--kokkos_" }); + int filtered_argc = static_cast(filtered_argv.size()); + Kokkos::initialize(filtered_argc, filtered_argv.data()); } #endif } diff --git a/include/plssvm/mpi/communicator.hpp b/include/plssvm/mpi/communicator.hpp index 886d13427..d0af17b88 100644 --- a/include/plssvm/mpi/communicator.hpp +++ b/include/plssvm/mpi/communicator.hpp @@ -96,6 +96,12 @@ class communicator { */ [[nodiscard]] constexpr static bool is_mpi_enabled() { return PLSSVM_IS_DEFINED(PLSSVM_HAS_MPI_ENABLED); } + /** + * @brief Check whether more than one MPI process is running, i.e., MPI is used to speed-up the computations. + * @return `true` if more than one MPI process is running, otherwise `false` ([[nodiscard]]) + */ + [[nodiscard]] bool is_mpi_parallel() const { return this->size() > std::size_t{ 1 }; } + /** * @brief Returns `true` if the current MPI rank is rank `0`, i.e., the main MPI rank. * @details If `PLSSVM_HAS_MPI_ENABLED` is undefined, returns `true`. diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp index 378c3d3ea..42c9544ff 100644 --- a/include/plssvm/parameter.hpp +++ b/include/plssvm/parameter.hpp @@ -57,8 +57,8 @@ IGOR_MAKE_NAMED_ARGUMENT(solver); IGOR_MAKE_NAMED_ARGUMENT(classification); /// Create a named argument for the SYCL backend specific SYCL implementation type (DPC++ or AdaptiveCpp). IGOR_MAKE_NAMED_ARGUMENT(sycl_implementation_type); -/// Create a named argument for the SYCL backend specific kernel invocation type. -IGOR_MAKE_NAMED_ARGUMENT(sycl_kernel_invocation_type); +/// Create a named argument for the SYCL backend specific data parallel kernels. +IGOR_MAKE_NAMED_ARGUMENT(sycl_data_parallel_kernel); /// Create a named argument for the Kokkos backend specific execution space. 
IGOR_MAKE_NAMED_ARGUMENT(kokkos_execution_space); @@ -76,13 +76,13 @@ constexpr bool has_only_parameter_named_args_v = !igor::has_other_than( * @brief Trait to check whether @p Args only contains named-parameter that can be used to initialize a `plssvm::parameter` struct including SYCL specific named-parameters. */ template -constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type); +constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_data_parallel_kernel); /** * @brief Trait to check whether @p Args only contains SYCL specific named-parameters. */ template -constexpr bool has_only_sycl_named_args_v = !igor::has_other_than(plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type); +constexpr bool has_only_sycl_named_args_v = !igor::has_other_than(plssvm::sycl_implementation_type, plssvm::sycl_data_parallel_kernel); /** * @brief Trait to check whether @p Args only contains named-parameter that can be used to initialize a `plssvm::parameter` struct including Kokkos specific named-parameters. @@ -215,7 +215,7 @@ struct parameter { // compile time check: each named parameter must only be passed once static_assert(!parser.has_duplicates(), "Can only use each named parameter once!"); // compile time check: only some named parameters are allowed - static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type, plssvm::kokkos_execution_space), + static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_data_parallel_kernel, plssvm::kokkos_execution_space), "An illegal named parameter has been passed!"); // shorthand function for emitting a warning if a provided parameter is not used by the current kernel function diff --git a/include/plssvm/solver_types.hpp b/include/plssvm/solver_types.hpp index 3bcbe68f9..83f1e7139 100644 --- a/include/plssvm/solver_types.hpp +++ b/include/plssvm/solver_types.hpp @@ -32,6 +32,8 @@ enum class solver_type { automatic, /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the device. */ cg_explicit, + /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the host. Realized using unified shared memory. */ + cg_streaming, /** Use the CG algorithm implicitly recomputing the kernel matrix each CG iteration (smallest memory footprint). 
*/ cg_implicit }; diff --git a/include/plssvm/svm/csvm.hpp b/include/plssvm/svm/csvm.hpp index 1acd4738e..fc98a1cdc 100644 --- a/include/plssvm/svm/csvm.hpp +++ b/include/plssvm/svm/csvm.hpp @@ -357,6 +357,7 @@ std::tuple, std::vector, std::vectornum_available_devices() }; const std::vector total_memory_needed_explicit_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); + const std::pair> total_memory_needed_streaming_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(num_features, num_rhs); const std::vector total_memory_needed_implicit_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); // format a vector differentiating between it containing only a single entry or multiple @@ -368,7 +369,7 @@ std::tuple, std::vector, std::vector, std::vector, std::vector(percentual_safety_margin * 100.0L), minimal_safety_margin, detail::tracking::tracking_entry{ "solver", "system_memory", total_system_memory }, @@ -386,11 +388,15 @@ std::tuple, std::vector, std::vector, std::vector, std::vector, std::vector, std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + if (const std::vector failed_cg_streaming_constraints = check_sizes(total_memory_needed_streaming_per_device.second, usable_device_memory_per_device); + total_memory_needed_streaming_per_device.first <= usable_system_memory && failed_cg_streaming_constraints.empty()) { // use the implicit solver type - used_solver = solver_type::cg_implicit; + used_solver = solver_type::cg_streaming; } else { - // not enough device memory available for the implicit case - throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + if (!comm_.is_mpi_parallel()) { + // output only if a single MPI rank is used + if (!failed_cg_streaming_constraints.empty()) { + detail::log_untracked(verbosity_level::full, + comm_, + "Cannot use cg_streaming due to memory constraints on device(s) {}!\n", + format_vector(failed_cg_streaming_constraints)); + } + if (total_memory_needed_streaming_per_device.first > usable_system_memory) { + // output only if a single MPI rank is used + detail::log_untracked(verbosity_level::full, + comm_, + "Cannot use cg_streaming due to system memory constraints!\n"); + } + } + + // check whether there is enough memory available for cg_implicit + if (const std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + // use the implicit solver type + used_solver = solver_type::cg_implicit; + } else { + // not enough device memory available for the implicit case + throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + } } } @@ -436,22 +464,27 @@ std::tuple, std::vector, std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_streaming_size_per_device = 
data_distribution.calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); // output the maximum memory allocation size per device - if (comm_.size() <= 1) { + if (!comm_.is_mpi_parallel()) { // output only if a single MPI rank is used detail::log_untracked(verbosity_level::full, comm_, + "\n" " - maximum supported single memory allocation size: {}\n" " - maximum needed single memory allocation size (cg_explicit): {}\n" + " - maximum needed single memory allocation size (cg_streaming): {}\n" " - maximum needed single memory allocation size (cg_implicit): {}\n", format_vector(max_mem_alloc_size_per_device), format_vector(max_single_allocation_cg_explicit_size_per_device), + format_vector(max_single_allocation_cg_streaming_size_per_device), format_vector(max_single_allocation_cg_implicit_size_per_device)); } PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_streaming", max_single_allocation_cg_streaming_size_per_device })); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); // check whether the maximum single memory allocation sizes per device can be satisfied @@ -459,20 +492,32 @@ std::tuple, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { // max mem alloc size constraints not fulfilled - if (comm_.size() <= 1) { + if (!comm_.is_mpi_parallel()) { // output only if a single MPI rank is used detail::log_untracked(verbosity_level::full, comm_, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_implicit.\n", + "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_streaming.\n", format_vector(failed_cg_explicit_constraints)); } // can't use cg_explicit + used_solver = solver_type::cg_streaming; + } + if (const std::vector failed_cg_streaming_constraints = check_sizes(max_single_allocation_cg_streaming_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_streaming && !failed_cg_streaming_constraints.empty()) { + // max mem alloc size constraints not fulfilled + if (!comm_.is_mpi_parallel()) { + detail::log_untracked(verbosity_level::full, + comm_, + "Cannot use cg_streaming due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", + format_vector(failed_cg_streaming_constraints)); + } + // can't use cg_streaming used_solver = solver_type::cg_implicit; } if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { // can't fulfill maximum single memory allocation size even for cg_implicit - if (comm_.size() <= 1) { + if (!comm_.is_mpi_parallel()) { // output only if a single MPI rank is used plssvm::detail::log_untracked(verbosity_level::full | verbosity_level::warning, comm_, @@ -484,7 +529,7 @@ std::tuple, std::vector, std::vector(backends_to_initialize); + environment_guard = std::make_unique(argc, argv, backends_to_initialize); // create default csvm const std::unique_ptr svm = [&]() { if (use_sycl_as_backend) { - return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); + return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_data_parallel_kernel = cmd_parser.sycl_data_parallel_kernel); } else if (use_kokkos_as_backend) { return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); } else { diff --git a/src/main_train.cpp b/src/main_train.cpp index cf4893946..7811b4e82 100644 --- a/src/main_train.cpp +++ b/src/main_train.cpp @@ -158,12 +158,12 @@ int main(int argc, char *argv[]) { if (use_kokkos_as_backend) { backends_to_initialize.push_back(plssvm::backend_type::kokkos); } - environment_guard = std::make_unique(backends_to_initialize); + environment_guard = std::make_unique(argc, argv, backends_to_initialize); // create SVM const std::unique_ptr svm = [&]() { if (use_sycl_as_backend) { - return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); + return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_data_parallel_kernel = cmd_parser.sycl_data_parallel_kernel); } else if (use_kokkos_as_backend) { return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); } else { diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu index 93db10d36..5328eb840 100644 --- a/src/plssvm/backends/CUDA/csvm.cu +++ b/src/plssvm/backends/CUDA/csvm.cu @@ -164,7 +164,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type 
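To make the cg_explicit → cg_streaming → cg_implicit fallback chain in the csvm.hpp hunks easier to follow, here is a condensed, stand-alone sketch of the decision logic. The memory bookkeeping is heavily simplified and all names except `solver_type` are illustrative; the real implementation additionally logs why a solver was rejected and repeats the checks for the maximum single allocation size.

```cpp
#include <cstddef>    // std::size_t
#include <stdexcept>  // std::runtime_error
#include <vector>     // std::vector

enum class solver_type { cg_explicit, cg_streaming, cg_implicit };  // simplified subset

// Illustrative only: pick the first solver whose memory requirements fit.
// 'needed_*' holds the per-device memory needed by each solver, 'usable_device' the per-device budget,
// and 'usable_system' the host memory budget relevant for the USM-backed cg_streaming solver.
solver_type select_solver(const std::vector<std::size_t> &needed_explicit,
                          const std::vector<std::size_t> &needed_streaming,
                          const std::vector<std::size_t> &needed_implicit,
                          const std::vector<std::size_t> &usable_device,
                          const std::size_t needed_system_streaming,
                          const std::size_t usable_system) {
    // true if every device's requirement fits into its budget (both vectors have one entry per device)
    const auto fits = [](const std::vector<std::size_t> &needed, const std::vector<std::size_t> &budget) {
        for (std::size_t i = 0; i < needed.size(); ++i) {
            if (needed[i] > budget[i]) { return false; }
        }
        return true;
    };
    if (fits(needed_explicit, usable_device)) { return solver_type::cg_explicit; }
    if (needed_system_streaming <= usable_system && fits(needed_streaming, usable_device)) { return solver_type::cg_streaming; }
    if (fits(needed_implicit, usable_device)) { return solver_type::cg_implicit; }
    throw std::runtime_error{ "Not enough device memory available even for the cg_implicit solver!" };
}
```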
&data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -179,7 +179,10 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // only store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to CUDA's native dim3 diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index 5d7ba74bb..00f20f66e 100644 --- a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -25,21 +25,25 @@ namespace plssvm::cuda::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { - if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { + if (queue_ < 0 || queue_ >= get_device_count()) { throw backend_exception{ fmt::format("Illegal device ID! 
Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_CUDA_ERROR_CHECK(cudaMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } diff --git a/src/plssvm/backends/HIP/csvm.hip b/src/plssvm/backends/HIP/csvm.hip index 312ad3122..fc2d4e5b7 100644 --- a/src/plssvm/backends/HIP/csvm.hip +++ b/src/plssvm/backends/HIP/csvm.hip @@ -180,7 +180,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -195,7 +195,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to HIP's native dim3 diff --git a/src/plssvm/backends/HIP/detail/device_ptr.hip b/src/plssvm/backends/HIP/detail/device_ptr.hip index 560783097..c958c73fd 100644 --- a/src/plssvm/backends/HIP/detail/device_ptr.hip +++ b/src/plssvm/backends/HIP/detail/device_ptr.hip @@ -29,21 +29,25 @@ namespace plssvm::hip::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type 
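The CUDA (and, below, HIP) device_ptr changes boil down to choosing between a device-only and a managed/unified-shared-memory allocation at construction time. A minimal sketch of that branch using the plain CUDA runtime API follows; the surrounding class is hypothetical and error checking is omitted, but the two allocation calls are the ones used in the diff.

```cpp
#include <cstddef>         // std::size_t
#include <cuda_runtime.h>  // cudaMalloc, cudaMallocManaged, cudaMemset

// Minimal sketch: allocate 'count' values of type T either device-only (cg_explicit)
// or as managed/USM memory that may exceed the device capacity (cg_streaming).
template <typename T>
T *allocate_device_memory(const std::size_t count, const bool use_usm_allocations) {
    T *data = nullptr;
    if (use_usm_allocations) {
        cudaMallocManaged(&data, count * sizeof(T));  // host-visible, migrated on demand
    } else {
        cudaMalloc(&data, count * sizeof(T));         // resides in device memory only
    }
    cudaMemset(data, 0, count * sizeof(T));           // zero-initialize, like this->memset(0) above
    return data;
}
```

The HIP path is the mirror image using hipMallocManaged/hipMalloc.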
device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_HIP_ERROR_CHECK(hipMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } diff --git a/src/plssvm/backends/HPX/csvm.cpp b/src/plssvm/backends/HPX/csvm.cpp index 71f651688..24f1c3d70 100644 --- a/src/plssvm/backends/HPX/csvm.cpp +++ b/src/plssvm/backends/HPX/csvm.cpp @@ -18,6 +18,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -113,6 +114,7 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { // calculate the number of data points this device is responsible for const std::size_t device_specific_num_rows = dist.place_specific_num_rows(0); @@ -120,26 +122,33 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, 
cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -199,17 +208,18 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); - PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + PLSSVM_ASSERT(explicit_A != nullptr, "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); @@ -261,6 +271,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s }); // wait until operation is completed wait.get(); + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -317,6 +329,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -358,6 +372,9 @@ aos_matrix csvm::predict_values(const parameter ¶ms, }); // wait until operation is completed wait.get(); + + // restore padding entries by setting them to zero + out.restore_padding(); return out; } diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt index 
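The HPX backend now avoids value-initializing a potentially multi-gigabyte std::vector for the kernel matrix: it allocates uninitialized storage via make_unique_for_overwrite and zeroes it in parallel. The sketch below shows what those two helpers amount to; PLSSVM's actual implementations in plssvm/detail/make_unique_for_overwrite.hpp may differ, and the OpenMP pragma is used here purely for illustration.

```cpp
#include <cstddef>  // std::size_t
#include <memory>   // std::unique_ptr

// Stand-in for std::make_unique_for_overwrite<T[]> (C++20): the elements are
// default-initialized, i.e., left uninitialized for arithmetic types.
template <typename T>
std::unique_ptr<T[]> make_unique_for_overwrite(const std::size_t size) {
    return std::unique_ptr<T[]>{ new T[size] };
}

// Zero the buffer in parallel instead of relying on (sequential) value-initialization.
template <typename T>
void parallel_zero_memset(T *ptr, const std::size_t size) {
    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        ptr[i] = T{ 0 };
    }
}
```

In the hunk above, the kernel matrix is then handed to the assembly and SYMM kernels via `kernel_matrix.get()` instead of as a `std::vector`, and the BLAS level 3 result as well as the predict outputs restore their padding entries to zero afterwards.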
371991c1f..818cb4794 100644 --- a/src/plssvm/backends/Kokkos/CMakeLists.txt +++ b/src/plssvm/backends/Kokkos/CMakeLists.txt @@ -28,6 +28,7 @@ set(PLSSVM_KOKKOS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp + ${CMAKE_CURRENT_LIST_DIR}/memory_space.cpp ) # set target properties diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp index 2bf512433..e41aa14f4 100644 --- a/src/plssvm/backends/Kokkos/csvm.cpp +++ b/src/plssvm/backends/Kokkos/csvm.cpp @@ -20,7 +20,7 @@ #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::kokkos::detail::device_kernel_assembly #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::kokkos::detail::device_kernel_assembly_symm #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp" // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE +#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked @@ -51,9 +51,11 @@ #include // std::numeric_limits::max #include // std::map #include // std::string -#include // std::move +#include // std::move, std::forward #include // std::vector +namespace { + // a dummy class used as functor to the team_size_max function template struct dummy { @@ -61,6 +63,114 @@ struct dummy { void operator()(const typename Kokkos::TeamPolicy::member_type &) const { } }; +/** + * @brief Run the kernel functor on the given device. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] partial_grid the number of work-groups in each dimension of the execution grid + * @param[in] block the number of work-items in each dimension per work-group + * @param[in] args the parameters necessary for the specific kernel functor + */ +template +void run_kernel_functor(const std::string &kernel_name, const TeamPolicy &policy, Args &&...args) { + Kokkos::parallel_for(kernel_name, policy, KernelFunctor{ std::forward(args)... }); +} + +/** + * @brief Dispatch the kernel functor to the correct kernel function type. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam ExecutionSpace the used Kokkos execution space + * @tparam USMEnabledMemorySpace the possibly USM enabled Kokkos memory space + * @tparam target the target platform to run the kernel on + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] params the parameters used to determine the kernel function type + * @param[in] args the parameters necessary for the specific kernel functor + */ +template
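The template parameter lists of the new Kokkos helper functions were lost in the hunk above; the generic launcher presumably looks roughly like the following reconstruction, so the exact parameter list is an assumption. It simply constructs the kernel functor from its constructor arguments and dispatches it through `Kokkos::parallel_for` under the given kernel name.

```cpp
#include <Kokkos_Core.hpp>  // Kokkos::parallel_for
#include <string>           // std::string
#include <utility>          // std::forward

// Reconstructed sketch: build the kernel functor from its arguments and launch it
// with the given team policy under a human-readable kernel name.
template <typename KernelFunctor, typename TeamPolicy, typename... Args>
void run_kernel_functor(const std::string &kernel_name, const TeamPolicy &policy, Args &&...args) {
    Kokkos::parallel_for(kernel_name, policy, KernelFunctor{ std::forward<Args>(args)... });
}
```

The dispatch helper whose documentation starts at the end of this hunk then presumably selects the concrete kernel functor (e.g., based on `params.kernel_type` and the target platform) and forwards its arguments to this launcher.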