diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b5c16f86..97edf2f5f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,14 +78,15 @@ endif () # set base sources set(PLSSVM_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/Kokkos/execution_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/data_parallel_kernels.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/implementation_types.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/kernel_invocation_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/stdpar/implementation_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/execution_range.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/data_set/min_max_scaler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/parser_predict.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/parser_scale.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/parser_train.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/cmd/utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/io/file_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/data_distribution.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/memory_size.cpp @@ -638,6 +639,37 @@ if (PLSSVM_ENABLE_LTO) endif () endif () +######################################################################################################################## +# enable the requested vectorization widths for the auto-vectorizers # +######################################################################################################################## +# GCC and clang both do not automatically auto-vectorize for AVX-512 (only AVX2) +# -> enable it if "cpu:avx512" was passed as PLSSVM_TARGET_PLATFORMS +if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + if (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx512") + message(STATUS "Enabling AVX512 support for the auto-vectorizers (-mprefer-vector-width=512).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=512>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx2" OR ${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx") + message(STATUS "Enabling AVX/AVX2 support for the auto-vectorizers (-mprefer-vector-width=256).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=256>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} MATCHES "^sse") + message(STATUS "Enabling SSE for the auto-vectorizers (-mprefer-vector-width=128).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=128>> + ) + else () + message(FATAL_ERROR "Unrecognized CPU target architecture \"${PLSSVM_CPU_TARGET_ARCHS}\". 
Allowed values are: avx512, avx2, avx, sse.") + endif () +else () + # automatically use the "optimal" auto-vectorizer width +endif () + ######################################################################################################################## # check for optional and necessary dependencies # ######################################################################################################################## @@ -914,16 +946,16 @@ if (TARGET ${PLSSVM_SYCL_BACKEND_LIBRARY_NAME}) choose the SYCL implementation to be used in the SYCL backend: ${PLSSVM_SYCL_BACKEND_NAME_LIST} (default: automatic) " ) - string(REPLACE ";" "|" PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST}") - set(PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY + string(REPLACE ";" "|" PLSSVM_SYCL_DATA_PARALLEL_KERNEL_NAME_LIST "${PLSSVM_SYCL_DATA_PARALLEL_KERNEL_NAME_LIST}") + set(PLSSVM_SYCL_DATA_PARALLEL_KERNEL_MANPAGE_ENTRY " .TP -.B --sycl_kernel_invocation_type -choose the kernel invocation type when using SYCL as backend: ${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_NAME_LIST} (default: automatic) +.B --sycl_data_parallel_kernel +choose the data parallel kernel when using SYCL as backend: ${PLSSVM_SYCL_DATA_PARALLEL_KERNEL_NAME_LIST} (default: automatic) " ) endif () -set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_KERNEL_INVOCATION_TYPE_MANPAGE_ENTRY}${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") +set(PLSSVM_SYCL_MANPAGE_ENTRY "${PLSSVM_SYCL_DATA_PARALLEL_KERNEL_MANPAGE_ENTRY}${PLSSVM_SYCL_IMPLEMENTATION_TYPE_MANPAGE_ENTRY}") # assemble the Kokkos manpage entry if (TARGET ${PLSSVM_KOKKOS_BACKEND_LIBRARY_NAME}) string(REPLACE ";" "|" PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES "${PLSSVM_KOKKOS_BACKEND_AVAILABLE_EXECUTION_SPACES}") diff --git a/README.md b/README.md index c764ed4d1..cb0efd1a6 100644 --- a/README.md +++ b/README.md @@ -31,38 +31,38 @@ A [Support Vector Machine (SVM)](https://en.wikipedia.org/wiki/Support-vector_machine) is a supervised machine learning model. In its basic form SVMs are used for binary classification tasks. Their fundamental idea is to learn a hyperplane which separates the two classes best, i.e., where the widest possible margin around its decision boundary is free of data. -This is also the reason, why SVMs are also called "large margin classifiers". +This is also the reason, why SVMs are also called "large margin classifiers." To predict to which class a new, unseen data point belongs, the SVM simply has to calculate on which side of the previously calculated hyperplane the data point lies. -This is very efficient since it only involves a single scalar product of the size corresponding to the numer of features of the data set. +This is very efficient since it only involves a single scalar product of the size corresponding to the numer of features per data point in the data set.

Basic idea of a Support Vector Machine as a classification model.
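
To make the single-scalar-product remark above concrete, here is a minimal C++ sketch of a linear decision function; the function and variable names are purely illustrative and are **not** part of the PLSSVM API:

```cpp
#include <cstddef>
#include <vector>

// illustrative only: classify one data point with an already learned linear hyperplane,
// i.e., evaluate sign(w * x + b) -- a single dot product over all features
int predict_linear(const std::vector<double> &weights, const double bias, const std::vector<double> &point) {
    double decision = bias;
    for (std::size_t i = 0; i < weights.size(); ++i) {
        decision += weights[i] * point[i];  // one multiply-add per feature
    }
    return decision >= 0.0 ? 1 : -1;  // the sign determines the side of the hyperplane
}
```

Evaluating this once per new data point is all the prediction step has to do after the hyperplane (here `weights` and `bias`) has been learned.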

-However, normal SVMs suffer in their potential parallelizability. +However, normal SVMs suffer from their potential parallelizability. Determining the hyperplane boils down to solving a convex quadratic problem. For this, most SVM implementations use Sequential Minimal Optimization (SMO), an inherently sequential algorithm. The basic idea of this algorithm is that it takes a pair of data points and calculates the hyperplane between them. Afterward, two new data points are selected and the existing hyperplane is adjusted accordingly. -This procedure is repeat until a new adjustment would be smaller than some epsilon greater than zero. +This procedure is repeated until a new adjustment would be smaller than some epsilon greater than zero. Some SVM implementations try to harness some parallelization potential by not drawing point pairs but group of points. In this case, the hyperplane calculation inside this group is parallelized. -However, even then modern highly parallel hardware can not be utilized efficiently. +However, even then, modern highly parallel hardware cannot be utilized efficiently. Therefore, we implemented a version of the original proposed SVM called [Least Squares Support Vector Machine (LS-SVM)](https://en.wikipedia.org/wiki/Least-squares_support-vector_machine). The LS-SVMs reformulated the original problem such that it boils down to solving a system of linear equations. -For this kind of problem many highly parallel algorithms and implementations are known. +For this kind of problem, many highly parallel algorithms and implementations are known. We decided to use the [Conjugate Gradient (CG)](https://en.wikipedia.org/wiki/Conjugate_gradient_method) to solve the system of linear equations. The main highlights of our SVM implementations are: 1. Drop-in replacement for LIBSVM's `svm-train`, `svm-predict`, and `svm-scale` (some features currently not implemented). -2. Support of multiple different programming frameworks for parallelization (also called backends in our PLSSVM implementation) which allows us to target GPUs and CPUs from different vendors like NVIDIA, AMD, or Intel: +2. Support for multiple different programming frameworks for parallelization (also called backends in our PLSSVM implementation) which allows us to support GPUs and CPUs from different vendors like NVIDIA, AMD, or Intel: - [OpenMP](https://www.openmp.org/) - [HPX](https://hpx.stellar-group.org/) (tested with current master) - C++ 17's standard parallelism [stdpar](https://en.cppreference.com/w/cpp/algorithm):
**Note**: due to the USM mechanics used in the `stdpar` implementations, the `stdpar` backend **can't** be enabled together with **any** other backend!
- **Note**: since every translation units need to be compiled with the same flag, we currently globally set `CMAKE_CXX_FLAGS` although it's discouraged. + **Note**: since every translation unit needs to be compiled with the same flag, we currently globally set `CMAKE_CXX_FLAGS` although it's discouraged. - [nvc++](https://developer.nvidia.com/hpc-sdk) from NVIDIA's HPC SDK (tested with version [25.3](https://docs.nvidia.com/hpc-sdk/hpc-sdk-release-notes/index.html)) - [roc-stdpar](https://github.com/ROCm/roc-stdpar) merged into upstream LLVM starting with version 18 (tested with version [18](https://releases.llvm.org/)) - [icpx](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html) as Intel's oneAPI compiler (tested with version [2025.0.0](https://www.intel.com/content/www/us/en/developer/articles/release-notes/oneapi-dpcpp/2025.html)) @@ -82,8 +82,9 @@ The main highlights of our SVM implementations are: - sigmoid: $\tanh(\gamma$ $\cdot$ $\vec{u}^T$ $\cdot$ $\vec{v}$ $+$ $coef0)$ - laplacian: $\exp(-\gamma$ $\cdot |$ $\vec{u}$ $-$ $\vec{v}$ $|_1)$ - chi-squared (only well-defined for values > 0): $\exp(-\gamma \cdot \sum_i \frac{(x[i] - y[i])^2}{x[i] + y[i]})$ -4. Two different solver types for a trade-off between memory footprint and runtime: - - `cg_explicit`: large memory overhead but very fast +4. Three different solver types for a trade-off between memory footprint and runtime: + - `cg_explicit`: large memory overhead but fast + - `cg_streaming`: the respective runtime automatically handles the memory migrations but may reduce the performance (implemented via unified shared memory) - `cg_implicit`: slower but requires drastically less memory 5. Multi-class classification available via one vs. all (also one vs. rest or OAA) and one vs. one (also OAO): - OAA: one huge classification task where our CG algorithm solves a system of linear equations with multiple right-hand sides. The resulting model file is **not** compatible with LIBSVM. 
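
As a small illustration of the kernel functions listed above, the chi-squared kernel can be written down directly from its formula; this sketch is for illustration only and does not use PLSSVM's internal kernel implementations:

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// illustrative only: chi-squared kernel k(x, y) = exp(-gamma * sum_i (x[i] - y[i])^2 / (x[i] + y[i]))
// note: only well-defined if all feature values are > 0
double chi_squared_kernel(const std::vector<double> &x, const std::vector<double> &y, const double gamma) {
    double sum = 0.0;
    for (std::size_t i = 0; i < x.size(); ++i) {
        const double diff = x[i] - y[i];
        sum += diff * diff / (x[i] + y[i]);
    }
    return std::exp(-gamma * sum);
}
```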
@@ -110,7 +111,7 @@ General dependencies: - [Pybind11 ≥ v2.13.6](https://github.com/pybind/pybind11) if Python bindings are enabled - [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed-up library utilities (like file parsing) - [MPI](https://www.mpi-forum.org/) if distributed memory systems should be supported; [mpi4py](https://mpi4py.readthedocs.io/en/stable/) to enable interoperability in our Python bindings -- [Format.cmake](https://github.com/TheLartians/Format.cmake) if auto formatting via cmake-format and clang-format is enabled; also requires at least clang-format-18 and git, additionally, needs our custom [cmake-format fork](https://github.com/vancraar/cmake_format) incorporating some patches +- [Format.cmake](https://github.com/TheLartians/Format.cmake) if auto formatting via cmake-format and clang-format is enabled; it also requires at least clang-format-18 and git, additionally, needs our custom [cmake-format fork](https://github.com/vancraar/cmake_format) incorporating some patches - multiple Python modules used in the utility scripts, to install all modules use `pip install --user -r install/python_requirements.txt` Additional dependencies for the OpenMP backend: @@ -297,8 +298,15 @@ The `[optional_options]` can be one or multiple of: - `PLSSVM_ENABLE_FAST_MATH=ON|OFF` (default depending on `CMAKE_BUILD_TYPE`: `ON` for Release or RelWithDebInfo, `OFF` otherwise): enable `fast-math` compiler flags for all backends - `PLSSVM_ENABLE_ASSERTS=ON|OFF` (default: `OFF`): enables custom assertions - `PLSSVM_USE_FLOAT_AS_REAL_TYPE=ON|OFF` (default: `OFF`): use `float` as real_type instead of `double` -- `PLSSVM_THREAD_BLOCK_SIZE` (default: `8`): set a specific thread block size used in the GPU kernels (for fine-tuning optimizations) -- `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `4`): set a specific internal block size used in the GPU kernels (for fine-tuning optimizations) +- `PLSSVM_THREAD_BLOCK_SIZE` (default: `8`): set a specific thread block size used in the kernels (for fine-tuning optimizations)
+ **Note**: for the different execution spaces in the Kokkos backend, the maximum value of the `PLSSVM_THREAD_BLOCK_SIZE` is not as straight forward as one may wish: + - CUDA, HIP, and SYCL: the maximum value depends on the underlying backend (in practice $\sqrt{1024}$ = 32) + - HPX and Serial: must **exactly** be 1 + - OpenMP: must be 1 or 2 (most likely only 1 will work) + - Threads: must be 1; however, note that Kokkos itself **must** be built with hwloc support (via `-DKokkos_ENABLE_HWLOC=ON`), otherwise the Kokkos::Threads execution space will always only use a single core + - OpenMPTarget: $\sqrt{256}$ = 16 + - OpenACC: $\lfloor\sqrt{512}\rfloor$ = 22 +- `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `4`): set a specific internal block size used in the kernels (for fine-tuning optimizations) - `PLSSVM_ENABLE_LTO=ON|OFF` (default: `OFF`): enable interprocedural optimization (IPO/LTO) if supported by the compiler - `PLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=ON|OFF` (default: `ON`): enforce the maximum (device) memory allocation size for the plssvm::solver_type::automatic solver - `PLSSVM_ENABLE_PINNED_MEMORY=ON|OFF` (default: `OFF`): use host pinned memory for the input matrix when assembling the kernel matrix, if available @@ -347,7 +355,7 @@ If the SYCL backend is available, additional options can be set. - `AUTO`: check for DPC++/icpx as implementation for the SYCL backend but **do not** fail if not available - `OFF`: do not check for DPC++/icpx as implementation for the SYCL backend -- `PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS` (default: `ON`): enable SYCL's `hierarchical` and AdaptiveCpp's `scoped` kernel invocation types +- `PLSSVM_ENABLE_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS` (default: `ON`): enable SYCL's `hierarchical` data parallel kernel and AdaptiveCpp's `scoped` parallelism To use DPC++/icpx for SYCL, simply set the `CMAKE_CXX_COMPILER` to the respective DPC++/icpx clang executable during CMake invocation. @@ -355,7 +363,7 @@ If the SYCL implementation is DPC++/icpx the following additional options are av - `PLSSVM_SYCL_BACKEND_DPCPP_USE_LEVEL_ZERO` (default: `ON`): use DPC++/icpx's Level-Zero backend instead of its OpenCL backend **(only available if a CPU or Intel GPU is targeted)** -If the SYCL implementation is AdaptiveCpp the following additional option is available: +If the SYCL implementation is AdaptiveCpp, the following additional option is available: - `PLSSVM_SYCL_BACKEND_ADAPTIVECPP_USE_GENERIC_SSCP` (default: `ON`): use AdaptiveCpp's new SSCP compilation flow @@ -497,7 +505,7 @@ Our `cmake-format` can be installed via: pip install "git+https://github.com/vancraar/cmake_format@master" ``` -To check whether formatting changes must be applied use: +To check whether formatting changes must be applied, one can use: ```bash cmake --build . --target check-cmake-format @@ -519,7 +527,7 @@ If doxygen is installed and `PLSSVM_ENABLE_DOCUMENTATION` is set to `ON` the doc cmake --build . -- doc ``` -The documentation of the current state of the main branch can be found [here](https://sc-sgs.github.io/PLSSVM/). +The documentation of the current main branch can be found [here](https://sc-sgs.github.io/PLSSVM/). 
### Installing @@ -550,13 +558,13 @@ export PYTHONPATH=${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/lib64:${PY #### Install via pip -We also support a pip packages that can be used to install our library: +We also support a pip package that can be used to install our library: ```bash pip install plssvm ``` -This pip install behaves **as if** the CMake `all_python` preset is used. +This pip installation behaves **as if** the CMake `all_python` preset is used. This means that the `PLSSVM_TARGET_PLATFORMS` are automatically determined and PLSSVM is build with all supported backends that available on the target machine at the point of the `pip install plssvm` invocation. To check the installation, including, e.g., the installed backends, we provide the `plssvm-install-check` command after @@ -588,13 +596,17 @@ Issues: https://github.com/SC-SGS/PLSSVM/issues PLSSVM provides three executables: `plssvm-train`, `plssvm-predict`, and `plssvm-scale`. In addition, PLSSVM can also be used as a library in third-party code. -For more information, see the respective `man` pages which are installed via `cmake --build . -- install`. +For more information, see the respective `man` pages which are installed via `cmake --build . -- install`. + +We support the command line options of the third-party libraries [HPX](https://hpx.stellar-group.org/) and [Kokkos](https://github.com/kokkos/kokkos) +by forwarding the command line options to the respective initialization functions. +Internally, these options are filtered out before they are passed to our command line parser utility. ### Generating Artificial Data The repository comes with a Python3 script (in the `utility_scripts/` directory) to simply generate arbitrarily large classification and regression data sets. 
-In order to use all functionality, the following Python3 modules must be installed: +To use all functionality, the following Python3 modules must be installed: [`argparse`](https://docs.python.org/3/library/argparse.html), [`timeit`](https://docs.python.org/3/library/timeit.html), [`numpy`](https://pypi.org/project/numpy/), [`pandas`](https://pypi.org/project/pandas/), [`sklearn`](https://scikit-learn.org/stable/), [`arff`](https://pypi.org/project/arff/), @@ -643,7 +655,7 @@ optional arguments: ``` -An example invocation generating a classification data set consisting of blobs with 1000 data points with 200 features each and +An example invocation generating a classification data set consisting of blobs with 1000 data points with 200 features and 4 classes could look like: ```bash @@ -682,12 +694,12 @@ Usage: -c, --cost arg set the parameter C (default: 1) -e, --epsilon arg set the tolerance of termination criterion (default: 1e-10) -i, --max_iter arg set the maximum number of CG iterations (default: num_features) - -l, --solver arg choose the solver: automatic|cg_explicit|cg_implicit (default: automatic) + -l, --solver arg choose the solver: automatic|cg_explicit|cg_streaming|cg_implicit (default: automatic) -a, --classification arg the classification strategy to use for multi-class classification: oaa|oao (default: oaa) -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) - --sycl_kernel_invocation_type arg - choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) + --sycl_data_parallel_kernel arg + choose the data parallel kernel when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg @@ -709,7 +721,7 @@ Usage: The help message only print options available based on the CMake invocation. For example, if CUDA was not available during the build step, it will not show up as possible backend in the description of the `--backend` option. -The most minimal example invocation is: +The most minimal example of an invocation is: ```bash ./plssvm-train /path/to/data_file @@ -734,7 +746,7 @@ The `--backend=automatic` option works as follows: - otherwise, if the `gpu_intel` target is available, check for existing backends in order `sycl` 🠦 `opencl` 🠦 `kokkos` 🠦 `stdpar` - otherwise, if the `cpu` target is available, check for existing backends in order `sycl` 🠦 `kokkos` 🠦 `opencl` 🠦 `openmp` 🠦 `hpx` 🠦 `stdpar` -Note that during CMake configuration it is guaranteed that at least one of the above combinations does exist. +Note that during CMake configuration, it is guaranteed that at least one of the above combinations does exist. 
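
The `--backend=automatic` fallback described above is essentially a prioritized lookup: for the detected target platform, the first backend from its preference list that was actually built is chosen. A minimal sketch of that idea, using only the `gpu_intel` and `cpu` orders quoted above (all names are illustrative, not PLSSVM code):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// illustrative only: pick the first backend from a per-target preference list that is available in this build
std::string select_backend(const std::string &target, const std::vector<std::string> &available_backends) {
    // preference orders as quoted above for the gpu_intel and cpu targets
    const std::vector<std::string> gpu_intel_order{ "sycl", "opencl", "kokkos", "stdpar" };
    const std::vector<std::string> cpu_order{ "sycl", "kokkos", "opencl", "openmp", "hpx", "stdpar" };

    for (const std::string &backend : (target == "gpu_intel" ? gpu_intel_order : cpu_order)) {
        if (std::find(available_backends.begin(), available_backends.end(), backend) != available_backends.end()) {
            return backend;  // first available backend wins
        }
    }
    return "none";  // unreachable in practice: CMake guarantees at least one valid combination
}
```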
The `--target_platform=automatic` option works for the different backends as follows: @@ -747,18 +759,19 @@ The `--target_platform=automatic` option works for the different backends as fol - `Kokkos`: checks which execution spaces are available and which target platforms they support and then tries to find available devices in the following order: NVIDIA GPUs 🠦 AMD GPUs 🠦 Intel GPUs 🠦 CPU - `stdpar`: target device must be selected at compile time (using `PLSSVM_TARGET_PLATFORMS`) or using environment variables at runtime -The `--sycl_kernel_invocation_type` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. -If the `--sycl_kernel_invocation_type` is `automatic`, the `work_group` invocation type is currently always used. +The `--sycl_data_parallel_kernel` and `--sycl_implementation_type` flags are only used if the `--backend` is `sycl`, otherwise a warning is emitted on `stderr`. +If the `--sycl_data_parallel_kernel` is `automatic`, the `work_group` data parallel kernels are currently always used. If the `--sycl_implementation_type` is `automatic`, the used SYCL implementation is determined by the `PLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` CMake flag. If the `--kokkos_execution_space` is `automatic`, uses the best fitting execution space based on the provided and/or available target platforms. ### Predicting using `plssvm-predict` -Our predict utility is fully conform to LIBSVM's model files. +Our `plssvm-predict` utility is fully conforming to LIBSVM's model files. This means that our `plssvm-predict` can be used on model files learned with, e.g., LIBSVM's `svm-train`. Note: this is not the case for the regression task since the `svm_type` filed mismatch between LIBSVM (`epsilon_svr`) -and PLSSVM (`c_svr`). To automatically convert between the two, simply use the `convert_model.py` script -(in the `utility_scripts/` directory) which simply replaces these fields with the respectively expected one +and PLSSVM (`c_svr`). +To automatically convert between the two, the `convert_model.py` script (in the `utility_scripts/` directory) +can be used which simply replaces these fields with the respectively expected one (note that for large files doing that manually may be faster): ```bash @@ -796,8 +809,8 @@ Usage: -b, --backend arg choose the backend: automatic|openmp|hpx|cuda|hip|opencl|sycl|kokkos|stdpar (default: automatic) -p, --target_platform arg choose the target platform: automatic|cpu|gpu_nvidia|gpu_amd|gpu_intel (default: automatic) - --sycl_kernel_invocation_type arg - choose the kernel invocation type when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) + --sycl_data_parallel_kernel arg + choose the data parallel kernel when using SYCL as backend: automatic|basic|work_group|hierarchical|scoped (default: automatic) --sycl_implementation_type arg choose the SYCL implementation to be used in the SYCL backend: automatic|dpcpp|adaptivecpp (default: automatic) --kokkos_execution_space arg @@ -872,7 +885,7 @@ An example invocation to scale a train and test file in the same way looks like: ### Distributed Memory Support via MPI We support distributed memory via MPI for `plssvm-train` and `plssvm-predict` while simultaneously allowing multiple devices per MPI rank. -In order to use it, MPI must be found during the CMake configuration step. +To use MPI, it must be found during the CMake configuration step. 
Note that if MPI couldn't be found, PLSSVM still works in shared memory mode only and internally disables all MPI related functionality. For example, to run PLSSVM via MPI on four nodes simply use the normal `mpirun` command: @@ -892,17 +905,17 @@ Note that the number of provided load balancing weights must be equal to the use If one MPI rank has more than one device, all these devices on one MPI rank compute the same number of matrix elements. Our MPI implementation, however, currently has some limitations: -- the training, test, and model data is fully read by **every** MPI rank -- the training, test, and model data is fully stored on **each** compute device on **every** MPI rank +- **every** MPI rank fully reads the training, test, and model data +- **each** compute device on **every** MPI rank fully stores the training, test, and model data - **only** the kernel matrix is really divided across **all** MPI ranks - while the expensive BLAS level 3 operations in the CG algorithm are computed in a distributed way, everything else is computed on **every** MPI rank -- in the CG algorithm we communicate the whole matrix, although it would be sufficient to communicate only matrix parts +- in the CG algorithm we communicate the whole matrix, although it would be enough to communicate only matrix parts - **only** the **main** MPI rank (per default rank 0) writes the output files - `plssvm-scale` **does not** support more than one MPI rank ### Example Code for PLSSVM Used as a Library -A simple C++ program (`main_classification.cpp`) using PLSSVM as library for classification could look like: +A simple C++ program (`main_classification.cpp`) using PLSSVM as a library for classification could look like: ```cpp #include "plssvm/core.hpp" @@ -940,7 +953,7 @@ int main() { const std::vector &correct_label = test_data.labels().value(); std::cout << plssvm::classification_report{ correct_label, predicted_label } << std::endl; - // write model file to disk + // write the model file to disk model.save("model_file.libsvm"); } catch (const plssvm::exception &e) { std::cerr << e.what_with_loc() << std::endl; @@ -952,7 +965,7 @@ int main() { } ``` -A simple C++ program (`main_regression.cpp`) using PLSSVM as library for regression could look like: +A simple C++ program (`main_regression.cpp`) using PLSSVM as a library for regression could look like: ```cpp #include "plssvm/core.hpp" @@ -990,7 +1003,7 @@ int main() { const std::vector &correct_values = test_data.labels().value(); std::cout << plssvm::regression_report{ correct_label, predicted_label } << std::endl; - // write model file to disk + // write the model file to disk model.save("model_file.libsvm"); } catch (const plssvm::exception &e) { std::cerr << e.what_with_loc() << std::endl; @@ -1089,7 +1102,7 @@ plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolors="k") -# generate legend handles and add handle +# generate legend handles legend_handles = [plt.scatter([], [], color=viridis(color), label=f'{label}') for label, color in zip(y_label, np.unique(y))] plt.legend(handles=legend_handles) diff --git a/bindings/Python/README.md b/bindings/Python/README.md index 504d2533b..bcc2c2c4f 100644 --- a/bindings/Python/README.md +++ b/bindings/Python/README.md @@ -321,7 +321,7 @@ The following table lists all PLSSVM enumerations exposed on the Python side: | enumeration | values | description | 
|------------------------|----------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `TargetPlatform` | `AUTOMATIC`, `CPU`, `GPU_NVIDIA`, `GPU_AMD`, `GPU_INTEL` | The different supported target platforms (default: `AUTOMATIC`). If `AUTOMATIC` is provided, checks for available devices in the following order: NVIDIA GPUs -> AMD GPUs -> Intel GPUs -> CPUs. | -| `SolverType` | `AUTOMATIC`, `CG_EXPLICIT`, `CG_IMPLICIT` | The different supported solver types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the used solver types depends on the available device and system memory. | +| `SolverType` | `AUTOMATIC`, `CG_EXPLICIT`, `CG_STREAMING`, `CG_IMPLICIT` | The different supported solver types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, the used solver types depends on the available device and system memory. | | `KernelFunctionType` | `LINEAR`, `POLYNOMIAL`, `RBF`, `SIGMOID`, `LAPLACIAN`, `CHI_SQUARED` | The different supported kernel functions (default: `RBF`). | | `FileFormatType` | `LIBSVM`, `ARFF` | The different supported file format types (default: `LIBSVM`). | | `GammaCoefficientType` | `AUTOMATIC`, `SCALE` | The different modes for the dynamic gamma calculation (default: `AUTOMATIC`). | @@ -332,10 +332,10 @@ The following table lists all PLSSVM enumerations exposed on the Python side: If a SYCL implementation is available, additional enumerations are available: -| enumeration | values | description | -|------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | -| `KernelInvocationType` | `AUTOMATIC`, `BASIC`, `WORK_GROUP`, `HIERARCHICAL`, `SCOPED` | The different supported SYCL kernel invocation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `WORK_GROUP`. | +| enumeration | values | description | +|----------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `ImplementationType` | `AUTOMATIC`, `DPCPP`, `ADAPTIVECPP` | The different supported SYCL implementation types (default: `AUTOMATIC`). If `AUTOMATIC` is provided, determines the used SYCL implementation based on the value of `-DPLSSVM_SYCL_BACKEND_PREFERRED_IMPLEMENTATION` provided during PLSSVM'S build step. | +| `DataParallelKernel` | `AUTOMATIC`, `BASIC`, `WORK_GROUP`, `HIERARCHICAL`, `SCOPED` | The different supported SYCL data parallel kernels (default: `AUTOMATIC`). If `AUTOMATIC` is provided, simply uses `WORK_GROUP`. 
| If the stdpar backend is available, an additional enumeration is available: @@ -469,7 +469,7 @@ The following constructors and methods are available for both classification `CS **Note**: if the backend type is `plssvm.BackendType.SYCL` two additional named parameters can be provided: `sycl_implementation_type` to choose between DPC++ and AdaptiveCpp as SYCL implementations -and `sycl_kernel_invocation_type` to choose between the two different SYCL kernel invocation types. +and `sycl_data_parallel_kernel` to choose between the different SYCL data parallel kernels. **Note**: if the backend type is `plssvm.BackendType.HPX` or `plssvm.BackendType.Kokkos` special initialization and finalization functions must be called. @@ -519,12 +519,12 @@ The following constructors and methods are available for both classification `CS | `CSVC(target, *, kernel_type=plssvm.KernelFunctionType.RBF, degree=3, gamma=plssvm.GammaCoefficientType.AUTO, coef0=0.0, cost=1.0, comm=*used MPI communicator*)` | Create a new C-SVM with the provided parameters and named arguments. | In case of the SYCL C-SVMs (`plssvm.sycl.CSVM`, `plssvm.dpcpp.CSVM`, and `plssvm.adaptivecpp.CSVM`; the same for the -`CSVR`s), additionally, all constructors also accept the SYCL specific `sycl_kernel_invocation_type` keyword parameter. +`CSVR`s), additionally, all constructors also accept the SYCL specific `sycl_data_parallel_kernel` keyword parameter. Also, the following method is additional available for the backend specific C-SVM: -| methods | description | -|--------------------------------|-----------------------------------------| -| `get_kernel_invocation_type()` | Return the SYCL kernel invocation type. | +| methods | description | +|------------------------------|--------------------------------------------| +| `get_data_parallel_kernel()` | Return the used SYCL data parallel kernel. | In case of the stdpar C-SVM (`plssvm.stdpar.CSVC` and `plssvm.stdpar.CSVR`) the following method is additional available for the backend specific C-SVM. diff --git a/bindings/Python/backends/adaptivecpp_csvm.cpp b/bindings/Python/backends/adaptivecpp_csvm.cpp index bf43d85f1..f9cc57b3a 100644 --- a/bindings/Python/backends/adaptivecpp_csvm.cpp +++ b/bindings/Python/backends/adaptivecpp_csvm.cpp @@ -6,20 +6,20 @@ * See the LICENSE.md file in the project root for full license information. 
*/ -#include "plssvm/backend_types.hpp" // plssvm::adaptivecpp::backend_csvm_type_t -#include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" // plssvm::adaptivecpp::csvm -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/svm/csvc.hpp" // plssvm::csvc -#include "plssvm/svm/csvm.hpp" // plssvm::csvm -#include "plssvm/svm/csvr.hpp" // plssvm::csvr -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::adaptivecpp::backend_csvm_type_t +#include "plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp" // plssvm::adaptivecpp::csvm +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/svm/csvc.hpp" // plssvm::csvc +#include "plssvm/svm/csvm.hpp" // plssvm::csvm +#include "plssvm/svm/csvr.hpp" // plssvm::csvr +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/type_caster/mpi_type_caster.hpp" // a custom Pybind11 type caster for a plssvm::mpi::communicator #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::register_py_exception @@ -49,18 +49,18 @@ void bind_adaptivecpp_csvms(py::module_ &m, const std::string &csvm_name) { const std::string keyword_args_constructor_docstring{ fmt::format("create an AdaptiveCpp SYCL {} with the provided SVM parameter as separate keyword arguments including optional SYCL specific keyword arguments", csvm_name) }; py::class_(m, csvm_name.c_str(), class_docstring.c_str()) - .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), params_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, py::kw_only(), py::arg("params") = default_params, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const 
int degree, const plssvm::gamma_type gamma, const plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { + .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const int degree, const plssvm::gamma_type gamma, const plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { const plssvm::parameter params{ kernel_type, degree, gamma, coef0, cost }; - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), keyword_args_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, @@ -70,11 +70,11 @@ void bind_adaptivecpp_csvms(py::module_ &m, const std::string &csvm_name) { py::arg("gamma") = default_params.gamma, py::arg("coef0") = default_params.coef0, py::arg("cost") = default_params.cost, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def("get_kernel_invocation_type", &plssvm::adaptivecpp::csvm::get_kernel_invocation_type, "get the kernel invocation type used in this SYCL C-SVM") + .def("get_data_parallel_kernel", &plssvm::adaptivecpp::csvm::get_data_parallel_kernel, "get the data parallel kernel used in this SYCL C-SVM") .def("__repr__", [csvm_name](const backend_csvm_type &self) { - return fmt::format("", csvm_name, self.num_available_devices(), self.get_kernel_invocation_type()); + return fmt::format("", csvm_name, self.num_available_devices(), self.get_data_parallel_kernel()); }); } diff --git a/bindings/Python/backends/dpcpp_csvm.cpp b/bindings/Python/backends/dpcpp_csvm.cpp index 51dcd7e16..152849bd7 100644 --- a/bindings/Python/backends/dpcpp_csvm.cpp +++ b/bindings/Python/backends/dpcpp_csvm.cpp @@ -6,20 +6,20 @@ * See the LICENSE.md file in the project root for full license information. 
*/ -#include "plssvm/backend_types.hpp" // plssvm::dpcpp::backend_csvm_type_t -#include "plssvm/backends/SYCL/DPCPP/csvm.hpp" // plssvm::dpcpp::csvm -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception -#include "plssvm/gamma.hpp" // plssvm::gamma -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/svm/csvc.hpp" // plssvm::csvc -#include "plssvm/svm/csvm.hpp" // plssvm::csvm -#include "plssvm/svm/csvr.hpp" // plssvm::csvr -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::dpcpp::backend_csvm_type_t +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/DPCPP/csvm.hpp" // plssvm::dpcpp::csvm +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::dpcpp::backend_exception +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/gamma.hpp" // plssvm::gamma +#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/svm/csvc.hpp" // plssvm::csvc +#include "plssvm/svm/csvm.hpp" // plssvm::csvm +#include "plssvm/svm/csvr.hpp" // plssvm::csvr +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/type_caster/mpi_type_caster.hpp" // a custom Pybind11 type caster for a plssvm::mpi::communicator #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::register_py_exception @@ -49,18 +49,18 @@ void bind_dpcpp_csvms(py::module_ &m, const std::string &csvm_name) { const std::string keyword_args_constructor_docstring{ fmt::format("create a DPC++ SYCL {} with the provided SVM parameter as separate keyword arguments including optional SYCL specific keyword arguments", csvm_name) }; py::class_(m, csvm_name.c_str(), class_docstring.c_str()) - .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + .def(py::init([](const plssvm::target_platform target, const plssvm::parameter params, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), params_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, py::kw_only(), py::arg("params") = default_params, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const int degree, const plssvm::gamma_type gamma, const 
plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::kernel_invocation_type invocation, plssvm::mpi::communicator comm) { + .def(py::init([](const plssvm::target_platform target, const plssvm::kernel_function_type kernel_type, const int degree, const plssvm::gamma_type gamma, const plssvm::real_type coef0, const plssvm::real_type cost, const plssvm::sycl::data_parallel_kernel data_parallel_kernel_type, plssvm::mpi::communicator comm) { const plssvm::parameter params{ kernel_type, degree, gamma, coef0, cost }; - return std::make_unique(std::move(comm), target, params, plssvm::sycl_kernel_invocation_type = invocation); + return std::make_unique(std::move(comm), target, params, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); }), keyword_args_constructor_docstring.c_str(), py::arg("target") = plssvm::target_platform::automatic, @@ -70,11 +70,11 @@ void bind_dpcpp_csvms(py::module_ &m, const std::string &csvm_name) { py::arg("gamma") = default_params.gamma, py::arg("coef0") = default_params.coef0, py::arg("cost") = default_params.cost, - py::arg("sycl_kernel_invocation_type") = plssvm::sycl::kernel_invocation_type::automatic, + py::arg("sycl_data_parallel_kernel") = plssvm::sycl::data_parallel_kernel::automatic, py::arg("comm") = plssvm::mpi::communicator{}) - .def("get_kernel_invocation_type", &plssvm::dpcpp::csvm::get_kernel_invocation_type, "get the kernel invocation type used in this SYCL C-SVM") + .def("get_data_parallel_kernel", &plssvm::dpcpp::csvm::get_data_parallel_kernel, "get the data parallel kernel used in this SYCL C-SVM") .def("__repr__", [csvm_name](const backend_csvm_type &self) { - return fmt::format("", csvm_name, self.num_available_devices(), self.get_kernel_invocation_type()); + return fmt::format("", csvm_name, self.num_available_devices(), self.get_data_parallel_kernel()); }); } diff --git a/bindings/Python/backends/sycl.cpp b/bindings/Python/backends/sycl.cpp index 98c27214b..3bf6b6c30 100644 --- a/bindings/Python/backends/sycl.cpp +++ b/bindings/Python/backends/sycl.cpp @@ -6,10 +6,10 @@ * See the LICENSE.md file in the project root for full license information. 
*/ -#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{implementation_type, list_available_sycl_implementations} -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::sycl::backend_exception +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::{implementation_type, list_available_sycl_implementations} +#include "plssvm/exceptions/exceptions.hpp" // plssvm::exception #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::{register_py_exception, register_implicit_str_enum_conversion} @@ -45,16 +45,16 @@ void init_sycl(py::module_ &m, const py::exception &base_exce sycl_module.def("list_available_sycl_implementations", &plssvm::sycl::list_available_sycl_implementations, "list all available SYCL implementations"); - py::enum_ py_enum_invocation(sycl_module, "KernelInvocationType", "Enum class for all possible SYCL kernel invocation types supported in PLSSVM."); - py_enum_invocation - .value("AUTOMATIC", plssvm::sycl::kernel_invocation_type::automatic, "use the best kernel invocation type for the current SYCL implementation and target hardware platform") - .value("BASIC", plssvm::sycl::kernel_invocation_type::basic, "use the basic data parallel kernel invocation type") - .value("WORK_GROUP", plssvm::sycl::kernel_invocation_type::work_group, "use the work-group data parallel kernel invocation type") - .value("HIERARCHICAL", plssvm::sycl::kernel_invocation_type::hierarchical, "use the hierarchical data parallel kernel invocation type") - .value("SCOPED", plssvm::sycl::kernel_invocation_type::scoped, "use the AdaptiveCpp specific scoped parallelism kernel invocation type"); + py::enum_ py_enum_data_parallel_kernel(sycl_module, "DataParallelKernel", "Enum class for all possible SYCL data parallel kernels supported in PLSSVM."); + py_enum_data_parallel_kernel + .value("AUTOMATIC", plssvm::sycl::data_parallel_kernel::automatic, "use the best data parallel kernel for the current SYCL implementation and target hardware platform") + .value("BASIC", plssvm::sycl::data_parallel_kernel::basic, "use the basic data parallel kernel") + .value("WORK_GROUP", plssvm::sycl::data_parallel_kernel::work_group, "use the work-group data parallel kernel") + .value("HIERARCHICAL", plssvm::sycl::data_parallel_kernel::hierarchical, "use the hierarchical data parallel kernel") + .value("SCOPED", plssvm::sycl::data_parallel_kernel::scoped, "use the AdaptiveCpp specific scoped parallelism kernel"); // enable implicit conversion from string to enum - plssvm::bindings::python::util::register_implicit_str_enum_conversion(py_enum_invocation); + plssvm::bindings::python::util::register_implicit_str_enum_conversion(py_enum_data_parallel_kernel); // initialize SYCL binding classes #if defined(PLSSVM_SYCL_BACKEND_HAS_ADAPTIVECPP) diff --git a/bindings/Python/solver_types.cpp b/bindings/Python/solver_types.cpp index f8309fb4b..cb6ca843c 100644 --- a/bindings/Python/solver_types.cpp +++ b/bindings/Python/solver_types.cpp @@ -20,6 +20,7 @@ void init_solver_types(py::module_ &m) { py_enum .value("AUTOMATIC", plssvm::solver_type::automatic, "the default solver type; depends on the available device and system memory") 
.value("CG_EXPLICIT", plssvm::solver_type::cg_explicit, "explicitly assemble the kernel matrix on the device") + .value("CG_STREAMING", plssvm::solver_type::cg_streaming, "explicitly calculate the kernel matrix and fully store it on the host; realized using unified shared memory") .value("CG_IMPLICIT", plssvm::solver_type::cg_implicit, "implicitly calculate the kernel matrix entries in each CG iteration"); // enable implicit conversion from string to enum diff --git a/bindings/Python/svm/utility.hpp b/bindings/Python/svm/utility.hpp index 38019bf8b..5b3bb823a 100644 --- a/bindings/Python/svm/utility.hpp +++ b/bindings/Python/svm/utility.hpp @@ -13,14 +13,14 @@ #define PLSSVM_BINDINGS_PYTHON_SVM_UTILITY_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/csvm_factory.hpp" // plssvm::make_csvm -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter, named arguments -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/csvm_factory.hpp" // plssvm::make_csvm +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter, named arguments +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "bindings/Python/utility.hpp" // plssvm::bindings::python::util::check_kwargs_for_correctness @@ -46,7 +46,7 @@ namespace plssvm::bindings::python::util { template [[nodiscard]] inline std::unique_ptr assemble_csvm(const plssvm::backend_type backend, const plssvm::target_platform target, const plssvm::parameter ¶ms, plssvm::mpi::communicator comm, const py::kwargs &optional_args) { // check keyword arguments - plssvm::bindings::python::util::check_kwargs_for_correctness(optional_args, { "foo", "sycl_implementation_type", "sycl_kernel_invocation_type", "kokkos_execution_space" }); + plssvm::bindings::python::util::check_kwargs_for_correctness(optional_args, { "foo", "sycl_implementation_type", "sycl_data_parallel_kernel", "kokkos_execution_space" }); if (backend == plssvm::backend_type::sycl) { // parse SYCL specific keyword arguments @@ -54,12 +54,12 @@ template if (optional_args.contains("sycl_implementation_type")) { impl_type = optional_args["sycl_implementation_type"].cast(); } - plssvm::sycl::kernel_invocation_type invocation_type = plssvm::sycl::kernel_invocation_type::automatic; - if (optional_args.contains("sycl_kernel_invocation_type")) { - invocation_type = optional_args["sycl_kernel_invocation_type"].cast(); + plssvm::sycl::data_parallel_kernel data_parallel_kernel_type = plssvm::sycl::data_parallel_kernel::automatic; + if (optional_args.contains("sycl_data_parallel_kernel")) { + data_parallel_kernel_type = optional_args["sycl_data_parallel_kernel"].cast(); } - return plssvm::make_csvm(backend, std::move(comm), target, params, plssvm::sycl_implementation_type 
= impl_type, plssvm::sycl_kernel_invocation_type = invocation_type); + return plssvm::make_csvm(backend, std::move(comm), target, params, plssvm::sycl_implementation_type = impl_type, plssvm::sycl_data_parallel_kernel = data_parallel_kernel_type); } else if (backend == plssvm::backend_type::kokkos) { // parse Kokkos specific keyword arguments plssvm::kokkos::execution_space space = plssvm::kokkos::execution_space::automatic; diff --git a/docs/plssvm-train.1.in b/docs/plssvm-train.1.in index deae26bc1..f5976c39c 100644 --- a/docs/plssvm-train.1.in +++ b/docs/plssvm-train.1.in @@ -48,7 +48,7 @@ the maximum number of CG iterations (default: #features) .TP .B -l, --solver arg -choose the solver: automatic|cg_explicit|cg_implicit (default: automatic) +choose the solver: automatic|cg_explicit|cg_streaming|cg_implicit (default: automatic) .TP .B -a, --classification arg diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index ec02c80c0..c45902c8e 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -22,6 +22,7 @@ #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES, plssvm::detail::is_one_type_of #include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator #include "plssvm/parameter.hpp" // plssvm::parameter, plssvm::detail::has_only_parameter_named_args_v +#include "plssvm/solver_types.hpp" // plssvm::solver_type #include "plssvm/svm/csvc.hpp" // plssvm::csvc #include "plssvm/svm/csvm.hpp" // plssvm::detail::csvm_backend_exists #include "plssvm/svm/csvr.hpp" // plssvm::csvr @@ -109,7 +110,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t namespace plssvm::cuda::detail { @@ -22,8 +24,8 @@ namespace plssvm::cuda::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -32,78 +34,77 @@ namespace plssvm::cuda::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays 
attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + 
PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -115,8 +116,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -125,68 +126,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> 
num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + 
threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -200,27 +205,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the 
indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -233,27 +240,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + 
grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 8a766b7db..70c9b4101 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -14,20 +14,22 @@ #pragma once #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
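// Editorial aside (illustrative only): the padded, row-major ("SoA") indexing used by the element-wise
// kernels above, where every row carries PADDING extra entries and the logical element (i, j) lives at
// i * (num_cols + PADDING) + j. PADDING and the helper below are placeholders for this sketch, not the
// PLSSVM constants or API.
#include <cstddef>

constexpr std::size_t PADDING = 16;  // stand-in value, not plssvm::PADDING_SIZE

inline std::size_t padded_index(const std::size_t i, const std::size_t j, const std::size_t num_cols) {
    return i * (num_cols + PADDING) + j;
}

// usage corresponding to the in-place kernels:
//   lhs[padded_index(i, j, num_cols)] += rhs[padded_index(i, j, num_cols)];  // matrix add
//   lhs[padded_index(i, j, num_cols)] *= scale;                              // matrix scale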
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -37,80 +39,84 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto 
i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to 
not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 62f24d6bf..186400757 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -15,9 +15,11 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** @@ -26,10 +28,10 @@ namespace plssvm::cuda::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -41,56 +43,64 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type 
QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * 
blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for 
(unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -101,16 +111,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -120,42 +132,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + 
temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -164,51 +178,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE 
= 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 8003a51a3..7748c45c8 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
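
// Editor's note (illustration only, not part of the patch): the simplified powi below evaluates
// base^exponent with a plain O(exponent) loop. A minimal host-side sketch of an alternative,
// exponentiation by squaring, which needs only O(log exponent) multiplications; the function name
// powi_by_squaring and the use of double are assumptions made for this sketch.
#include <cassert>

double powi_by_squaring(double base, int exponent) {
    assert(exponent >= 0);  // the surrounding documentation states the degree may only be a positive integer
    double result = 1.0;
    while (exponent > 0) {
        if (exponent & 1) {  // multiply the result by the current power of base if this exponent bit is set
            result *= base;
        }
        base *= base;        // square the base for the next binary digit of the exponent
        exponent >>= 1;
    }
    return result;
}
// e.g., powi_by_squaring(2.0, 6) returns 64.0, the same value as the plain loop below
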
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 204d6bd97..d7ebf45a3 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -15,166 +15,180 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
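
// Editor's note (illustration only, not part of the patch): a minimal host-side reference for the
// quantity device_kernel_w_linear accumulates. For the linear kernel,
//   w[feature][class] = sum over all support vectors of alpha[class][sv] * support_vectors[sv][feature];
// the blocking, padding, and SoA/AoS layouts in the CUDA kernel only change how this sum is evaluated,
// not its value. The plain 2D-vector layouts and names used here are assumptions for this sketch.
#include <cstddef>
#include <vector>

// alpha:           [num_classes][num_sv]
// support_vectors: [num_sv][num_features] (assumed non-empty)
// returns w:       [num_features][num_classes]
std::vector<std::vector<double>> reference_w(const std::vector<std::vector<double>> &alpha,
                                             const std::vector<std::vector<double>> &support_vectors) {
    const std::size_t num_classes = alpha.size();
    const std::size_t num_features = support_vectors.front().size();

    std::vector<std::vector<double>> w(num_features, std::vector<double>(num_classes, 0.0));
    for (std::size_t sv = 0; sv < support_vectors.size(); ++sv) {
        for (std::size_t feature = 0; feature < num_features; ++feature) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                w[feature][c] += alpha[c][sv] * support_vectors[sv][feature];
            }
        }
    }
    return w;
}
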
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
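
// Editor's note (illustration only, not part of the patch): a minimal host-side sketch of the value
// device_kernel_predict_linear produces for one data point and one class,
//   prediction[point][class] = sum over all features of predict_points[point][feature] * w[feature][class] - rho[class].
// The function name and the plain 2D-vector layout are assumptions for this sketch.
#include <cstddef>
#include <vector>

// point: [num_features], w: [num_features][num_classes], rho: [num_classes]
double reference_predict_linear_one(const std::vector<double> &point,
                                    const std::vector<std::vector<double>> &w,
                                    const std::vector<double> &rho,
                                    const std::size_t class_idx) {
    double result = 0.0;
    for (std::size_t feature = 0; feature < point.size(); ++feature) {
        result += point[feature] * w[feature][class_idx];  // dot product with the w column of this class
    }
    return result - rho[class_idx];  // subtract the learned bias exactly once
}
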
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = 
static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * 
INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
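
// Editor's note (illustration only, not part of the patch): a minimal host-side sketch of the decision
// value the generic device_kernel_predict accumulates for one data point and one class,
//   prediction[point][class] = sum over all support vectors of alpha[class][sv] * k(support_vectors[sv], point) - rho[class],
// where k is the chosen kernel function. The CUDA kernel evaluates k in two stages, a per-feature
// reduction (feature_reduce) followed by apply_kernel_function on the reduced value, and applies the
// bias only once per class. The std::function parameter and the plain 2D-vector layout are assumptions.
#include <cstddef>
#include <functional>
#include <vector>

// alpha: [num_classes][num_sv], rho: [num_classes], support_vectors: [num_sv][num_features], point: [num_features]
double reference_predict_one(const std::vector<std::vector<double>> &alpha,
                             const std::vector<double> &rho,
                             const std::vector<std::vector<double>> &support_vectors,
                             const std::vector<double> &point,
                             const std::function<double(const std::vector<double> &, const std::vector<double> &)> &k,
                             const std::size_t class_idx) {
    double result = -rho[class_idx];  // the bias is applied exactly once per class
    for (std::size_t sv = 0; sv < support_vectors.size(); ++sv) {
        result += alpha[class_idx][sv] * k(support_vectors[sv], point);
    }
    return result;
}
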
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -184,53 +198,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + 
global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -239,55 +255,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + 
THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } diff --git a/include/plssvm/backends/HIP/csvm.hpp b/include/plssvm/backends/HIP/csvm.hpp index e1f64e58e..86dd0af9b 100644 --- a/include/plssvm/backends/HIP/csvm.hpp +++ b/include/plssvm/backends/HIP/csvm.hpp @@ -109,7 +109,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t + namespace plssvm::hip::detail { /** @@ -25,8 +27,8 @@ namespace plssvm::hip::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -35,78 +37,77 @@ namespace plssvm::hip::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + 
threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + 
threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads 
performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -118,8 +119,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -128,68 +129,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of 
threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * 
THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * 
INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -203,27 +208,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent 
potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -236,27 +243,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index 75a3cd9a5..308867d76 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -14,23 +14,25 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
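For orientation, the assembly kernel documented next stores only the upper triangle of the kernel matrix, one padded row per matrix row; the index expression used further down in this hunk (`device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx`) follows from that layout. The sketch below is illustrative only; the helper name and the parameters `N` and `padding` are stand-ins for `num_rows - device_row_offset` and `PADDING_SIZE`, not part of the diff.

```cpp
#include <cstddef>

// element (j, i) of the packed, padded upper triangular storage, assuming i >= j:
// rows 0..j-1 occupy (N - k + padding) entries each, and row j starts at column j
constexpr std::size_t packed_upper_triangular_index(const std::size_t j, const std::size_t i,
                                                    const std::size_t N, const std::size_t padding) {
    return j * (N + padding) - j * (j + 1) / 2 + i;
}

// with N = 4 and padding = 2: row 0 holds 6 entries, so element (1, 1) is the first entry of row 1
static_assert(packed_upper_triangular_index(0, 0, 4, 2) == 0, "row 0 starts at index 0");
static_assert(packed_upper_triangular_index(1, 1, 4, 2) == 6, "row 1 starts right after row 0");
```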
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -40,80 +42,84 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto 
i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to 
not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 77820e35a..b2bee8d46 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -14,12 +14,14 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -28,10 +30,10 @@ namespace plssvm::hip::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -43,56 +45,64 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, 
real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // 
device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < 
FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -103,16 +113,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -122,42 +134,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + 
temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -166,51 +180,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE 
= 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp index a98bb0715..1b2be0ae6 100644 --- a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
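The hunk below drops the hardcoded `powi` cases for exponents <= 6 in favour of the generic loop. As a quick sanity check that the loop reproduces the old special cases, a constexpr host-side twin can be compared against them at compile time; `powi_ref` is illustrative only and not part of PLSSVM.

```cpp
// constexpr twin of the generic integer power loop introduced below (C++14 or newer)
constexpr double powi_ref(const double base, const int exponent) {
    double result{ 1.0 };
    for (int i = 0; i < exponent; ++i) {
        result *= base;
    }
    return result;
}

static_assert(powi_ref(2.0, 0) == 1.0, "x^0 == 1, matches the old case 0");
static_assert(powi_ref(2.0, 3) == 8.0, "matches the old case 3");
static_assert(powi_ref(2.0, 6) == 64.0, "matches the old case 6");
```

For the small polynomial degrees the old special cases targeted, the plain loop costs at most a handful of multiplications; exponentiation by squaring would only pay off for much larger exponents.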
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6e349927e..9ee22edc4 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -14,169 +14,183 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include <cstddef> // std::size_t + namespace plssvm::hip::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function.
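Functionally, `device_kernel_w_linear` (documented next) computes one weighted sum of support vectors per class: w[class][feature] = Σ_sv alpha[class][sv] · support_vector[sv][feature]. The following host-side sketch shows that reduction with simplified, unpadded vector-of-vector layouts; the function name and layouts are illustrative assumptions, not the PLSSVM API.

```cpp
#include <cstddef>
#include <vector>

// alpha:           [num_classes][num_sv]       (the previously learned weights)
// support_vectors: [num_sv][num_features]
// returns w:       [num_classes][num_features]
std::vector<std::vector<double>> compute_w_linear(const std::vector<std::vector<double>> &alpha,
                                                  const std::vector<std::vector<double>> &support_vectors) {
    const std::size_t num_classes = alpha.size();
    const std::size_t num_sv = support_vectors.size();
    const std::size_t num_features = support_vectors.front().size();

    std::vector<std::vector<double>> w(num_classes, std::vector<double>(num_features, 0.0));
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t sv = 0; sv < num_sv; ++sv) {
            for (std::size_t f = 0; f < num_features; ++f) {
                w[c][f] += alpha[c][sv] * support_vectors[sv][f];  // weighted sum of the support vectors
            }
        }
    }
    return w;
}
```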
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
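The companion kernel `device_kernel_predict_linear` (documented next) then reduces each prediction to a dot product with the precomputed `w` plus the bias term: prediction[point][class] = ⟨w[class], point⟩ - rho[class]. A minimal host-side sketch, using the same simplified layouts as the `w` sketch above (names are illustrative, not the PLSSVM API):

```cpp
#include <cstddef>
#include <vector>

// w:              [num_classes][num_features]
// rho:            [num_classes]                (the previously learned biases)
// predict_points: [num_predict_points][num_features]
// returns:        [num_predict_points][num_classes]
std::vector<std::vector<double>> predict_linear(const std::vector<std::vector<double>> &w,
                                                const std::vector<double> &rho,
                                                const std::vector<std::vector<double>> &predict_points) {
    std::vector<std::vector<double>> prediction(predict_points.size(), std::vector<double>(w.size(), 0.0));
    for (std::size_t pp = 0; pp < predict_points.size(); ++pp) {
        for (std::size_t c = 0; c < w.size(); ++c) {
            double dot = 0.0;
            for (std::size_t f = 0; f < predict_points[pp].size(); ++f) {
                dot += w[c][f] * predict_points[pp][f];  // dot product with the precomputed w vector
            }
            prediction[pp][c] = dot - rho[c];  // subtract the learned bias
        }
    }
    return prediction;
}
```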
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = 
static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * 
INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
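The generic predict kernel documented below declares two flat shared-memory buffers (`cache_one`, `cache_two`) and reinterprets them as two-dimensional tiles, first for the predict-point and support-vector features and later, reusing the same memory, for the alpha values and the partial results. The cast itself is plain C++; a minimal host-side sketch with illustrative tile sizes and `float` standing in for `real_type`:

    #include <cassert>

    constexpr unsigned THREAD_BLOCK_SIZE = 4;   // illustrative only
    constexpr unsigned INTERNAL_BLOCK_SIZE = 2; // illustrative only

    int main() {
        // stands in for the flat __shared__ buffer inside the kernel
        static float cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]{};

        // view the same memory as [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]
        auto pp_cache = reinterpret_cast<float (*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]>(cache_one);
        pp_cache[1][3] = 42.0f;

        // element [1][3] aliases flat element 1 * (INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE) + 3
        assert(cache_one[1 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + 3] == 42.0f);
        return 0;
    }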
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -186,53 +200,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + 
global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -241,55 +257,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + 
THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp index 20cbad247..579a56715 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp @@ -34,60 +34,63 @@ namespace plssvm::hpx::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / 
INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -95,13 +98,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const 
std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -113,70 +117,74 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= 
device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values using blocking + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + sum += A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // 
SoA, upper triangular matrix only + B(global_i_idx, dim_block + dim + device_row_offset); // SoA + } + temp[internal_j][internal_i] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index e575c6af2..f1cf4723e 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #pragma once #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix @@ -32,82 +32,89 @@ namespace plssvm::hpx::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Assemble the kernel matrix using the @p kernel_function function. 
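The assembly kernel documented below writes only the upper triangular part of the per-device kernel matrix into a packed, padded 1D buffer, using the same index scheme the SYMM kernels above read from: idx = j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i for i >= j, with n = num_rows - device_row_offset. A small self-contained sketch of that packed layout (sizes and the padding value are illustrative):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        constexpr std::size_t n = 4;            // rows this device is responsible for (illustrative)
        constexpr std::size_t PADDING_SIZE = 2; // assumed padding, smaller than the real constant

        // packed upper triangular storage; sized so every access with j <= i < n stays in bounds
        std::vector<double> packed(n * (n + PADDING_SIZE) - n * (n - 1) / 2, 0.0);

        const auto packed_index = [=](const std::size_t i, const std::size_t j) {  // requires i >= j
            return j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i;
        };

        packed[packed_index(3, 1)] = 42.0;               // symmetric entry (3, 1), i.e. also (1, 3)
        std::cout << packed[packed_index(3, 1)] << '\n'; // prints 42
        return 0;
    }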
+ * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto 
PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; 
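Written out, each finished entry produced by this epilogue is K(i, j) = apply_kernel_function(reduced_ij) + QA_cost - q[i] - q[j], with the cost term added once on the main diagonal. A scalar sketch of that final step, assuming the linear kernel (for which the final transform is the identity) and illustrative names:

    #include <cstddef>
    #include <vector>

    double kernel_matrix_entry(const double reduced,  // accumulated feature reduction for the pair (i, j)
                               const double QA_cost, const double cost,
                               const std::vector<double> &q,
                               const std::size_t i, const std::size_t j) {
        // linear kernel: the reduced dot product is used as-is
        double value = reduced + QA_cost - q[i] - q[j];
        if (i == j) {
            value += cost;  // cost (= 1 / plssvm::cost) is only added on the main diagonal
        }
        return value;
    }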
- temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 06df89dac..7b8d79e1b 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -34,25 +34,25 @@ namespace plssvm::hpx::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. 
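The implicit variant documented below never materializes the kernel matrix: each upper triangular entry is consumed immediately and, off the diagonal, contributes to C twice via symmetry, which is why the updates have to be atomic. A tiny dense, single-threaded reference of that update rule (names and the row-major layout are illustrative, not the library API):

    #include <cstddef>
    #include <vector>

    // B and C are stored row-major with shape [num_classes][num_rows]
    void symm_update(std::vector<double> &C, const std::vector<double> &B,
                     const std::size_t num_classes, const std::size_t num_rows,
                     const std::size_t i, const std::size_t j,
                     const double alpha, const double kernel_ij) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            // entry (i, j) contributes B(c, j) to C(c, i) ...
            C[c * num_rows + i] += alpha * kernel_ij * B[c * num_rows + j];
            if (i != j) {
                // ... and, by symmetry, B(c, i) to C(c, j), but only for off-diagonal entries
                C[c * num_rows + j] += alpha * kernel_ij * B[c * num_rows + i];
            }
        }
    }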
+ * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -61,64 +61,92 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent 
potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + 
static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_row); + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_col); + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry - atomic_ref{ C(class_idx, global_col) } += alpha * temp_ij * B(class_idx, global_row); + atomic_ref{ C(class_block + class_idx, 
global_j_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp index 6c0cd8a43..35e79d01d 100644 --- a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp @@ -28,42 +28,17 @@ namespace plssvm::hpx::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 7ea68e172..d5e811c63 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -16,7 +16,7 @@ #include "plssvm/backends/HPX/detail/utility.hpp" // plssvm::hpx::detail::atomic_ref #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix @@ -38,59 +38,63 @@ namespace plssvm::hpx::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == 
support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = support_vectors.num_cols(); const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated + // define the range over which should be iterated std::vector range(blocked_num_classes * blocked_num_features); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = 
class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha(global_class_idx, sv_offset + sv) * support_vectors(sv_offset + sv, global_feature_idx); + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; } } } - // update global array with local one + // store the result back to the w vector for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w(global_class_idx, global_feature_idx) = temp[internal_feature][internal_class]; + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } }); @@ -102,63 +106,64 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = 
static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_classes); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_pp][internal_class] += w(global_class_idx, dim) * predict_points(global_pp_idx, dim); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; } } } - // perform the dot product calculation + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { - prediction(global_pp_idx, 
global_class_idx) = temp[internal_pp][internal_class] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } }); @@ -166,61 +171,63 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons /** * @brief Predict the @p predict_points_d using the @p kernel_function. - * @tparam kernel the type of the used kernel function + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function * @param[out] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_support_vectors); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx % blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> 
temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), - predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -228,25 +235,23 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } - // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + // atomically add the results to the prediction + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction(global_pp_idx, a) } += -rho[a]; + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += -rho[class_block + class_idx]; } - atomic_ref{ prediction(global_pp_idx, a) } += - temp[internal_pp][internal_sv] * alpha(a, global_sv_idx); + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/include/plssvm/backends/Kokkos/csvm.hpp b/include/plssvm/backends/Kokkos/csvm.hpp index 5a77ef1e1..3098a0d87 100644 --- 
a/include/plssvm/backends/Kokkos/csvm.hpp +++ b/include/plssvm/backends/Kokkos/csvm.hpp @@ -139,7 +139,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::array + +namespace plssvm::kokkos::detail { + +/** + * @brief List all available Kokkos::MemorySpaces at compile time. + * @details The `memory_space::host_space` is always available! + * @return a `std::array` containing all available memory spaces (`[[nodiscard]]`) + */ +[[nodiscard]] inline constexpr auto constexpr_available_memory_spaces() noexcept { + // Note: the trailing comma is explicitly allowed by the standard + return std::array{ + memory_space::host_space, +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_CUDA) + memory_space::cuda_space, + memory_space::cuda_usm_space, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HIP) + memory_space::hip_space, + memory_space::hip_usm_space, +#endif +#if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_SYCL) + memory_space::sycl_space, + memory_space::sycl_usm_space, +#endif + }; +} + +} // namespace plssvm::kokkos::detail + +#endif // PLSSVM_BACKENDS_KOKKOS_DETAIL_CONSTEXPR_AVAILABLE_MEMORY_SPACES_HPP_ diff --git a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp index ad067d00b..19cc9cb60 100644 --- a/include/plssvm/backends/Kokkos/detail/device_ptr.hpp +++ b/include/plssvm/backends/Kokkos/detail/device_ptr.hpp @@ -34,6 +34,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr // std::array #include // std::size_t @@ -38,27 +39,27 @@ struct create_view_variant_type_helper; /** * @brief Helper struct to create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of - * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * `plssvm::kokkos::memory_space` values as returned by `plssvm::kokkos::detail::constexpr_available_memory_spaces()`. * @tparam T the value type of the underlying Kokkos::View * @tparam Is the indices to index the `std::array` */ template struct create_view_variant_type_helper> { - /// The array containing all available execution spaces. - constexpr static auto array = detail::constexpr_available_execution_spaces(); + /// The array containing all available memory spaces. + constexpr static auto array = detail::constexpr_available_memory_spaces(); /// The resulting variant type. - using type = std::variant>...>; + using type = std::variant>...>; }; /** * @brief Create a `std::variant` containing all available Kokkos::View types by iterating over the `std::array` of - * `plssvm::kokkos::execution_space` values as returned by `plssvm::kokkos::detail::constexpr_available_execution_spaces()`. + * `plssvm::kokkos::memory_space` values as returned by `plssvm::kokkos::detail::constexpr_available_memory_spaces()`. * @tparam T the value type of the underlying Kokkos::View */ template struct create_view_variant_type { /// The number of types in the final variant. - constexpr static std::size_t N = detail::constexpr_available_execution_spaces().size(); + constexpr static std::size_t N = detail::constexpr_available_memory_spaces().size(); /// The final variant type. using type = typename create_view_variant_type_helper>::type; }; @@ -82,37 +83,49 @@ class device_view_wrapper { /** * @brief Construct the wrapper using the provided Kokkos::View instance by forwarding its value to the underlying `std::variant`. 
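The `device_view_wrapper` introduced in this hunk type-erases the concrete `Kokkos::View` behind a `std::variant` whose alternatives are generated from the `constexpr` array of `memory_space` values. The following Kokkos-free sketch illustrates that pattern; all names (`memory_space`, `buffer`, `buffer_wrapper`, ...) are hypothetical stand-ins, not the actual PLSSVM types.

```cpp
// Minimal sketch: a std::variant whose alternatives are generated from a
// constexpr array of enum values, plus a get<enum>() accessor and an
// index -> enum lookup (mirroring device_view_wrapper / create_view_variant_type).
#include <array>
#include <cstddef>
#include <utility>
#include <variant>

enum class memory_space { host, device, device_usm };

constexpr std::array available_spaces{ memory_space::host, memory_space::device, memory_space::device_usm };

// stand-in for a Kokkos::View placed in a specific memory space
template <typename T, memory_space Space>
struct buffer {
    std::size_t size{};
};

// build std::variant<buffer<T, available_spaces[0]>, buffer<T, available_spaces[1]>, ...>
template <typename T, std::size_t... Is>
constexpr auto make_variant(std::index_sequence<Is...>) -> std::variant<buffer<T, available_spaces[Is]>...>;

template <typename T>
using buffer_variant_t = decltype(make_variant<T>(std::make_index_sequence<available_spaces.size()>{}));

template <typename T>
class buffer_wrapper {
  public:
    template <memory_space Space>
    explicit buffer_wrapper(buffer<T, Space> buf) :
        v_{ std::move(buf) } { }

    // retrieve the alternative that corresponds to the requested enum value
    template <memory_space Space>
    [[nodiscard]] buffer<T, Space> &get() { return std::get<buffer<T, Space>>(v_); }

    // the active alternative's index maps back to the enum value
    [[nodiscard]] constexpr memory_space get_memory_space() const noexcept { return available_spaces[v_.index()]; }

  private:
    buffer_variant_t<T> v_;
};

int main() {
    buffer_wrapper<double> w{ buffer<double, memory_space::device>{ 1024 } };
    return w.get_memory_space() == memory_space::device ? 0 : 1;
}
```

Because the enum array and the variant alternatives are generated from the same `std::index_sequence`, `v_.index()` can be mapped back to the enum value without any extra bookkeeping, which is what `get_memory_space()` relies on above and in the real wrapper.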
- * @tparam ExecutionSpace the used Kokkos::ExecutionSpace type of the Kokkos::View + * @tparam MemorySpace the used Kokkos::MemorySpace type of the Kokkos::View * @param[in] view the Kokkos::View instance */ - template - explicit device_view_wrapper(Kokkos::View &&view) : - v_{ std::move(view) } { } + template + explicit device_view_wrapper(Kokkos::View &&view, const bool use_usm_allocations = false) : + v_{ std::move(view) }, + use_usm_allocations_{ use_usm_allocations } { } /** * @brief Given the provided `execution_space` enum value, tries to get the `std::variant` alternative for the corresponding Kokkos::ExecutionSpace type. * @tparam space the `execution_space` enum value + * @tparam use_usm_allocations if `true` use USM allocations * @return the Kokkos::View instance (`[[nodiscard]]`) */ - template - [[nodiscard]] Kokkos::View> &get() { - return std::get>>(v_); + template + [[nodiscard]] auto &get() { + constexpr memory_space mem_space = execution_space_to_memory_space_v; + return std::get>>(v_); } /** * @copydoc plssvm::kokkos::detail::device_view_wrapper::get */ - template - [[nodiscard]] const Kokkos::View> &get() const { - return std::get>>(v_); + template + [[nodiscard]] const auto &get() const { + constexpr memory_space mem_space = execution_space_to_memory_space_v; + return std::get>>(v_); } /** - * @brief Return the `execution_space` enum value of the currently active `std::variant` Kokkos::View type. - * @return the `execution_space` enum value (`[[nodiscard]]`) + * @brief Return the `memory_space` enum value of the currently active `std::variant` Kokkos::View type. + * @return the `memory_space` enum value (`[[nodiscard]]`) */ - [[nodiscard]] execution_space get_execution_space() const noexcept { - return detail::constexpr_available_execution_spaces()[v_.index()]; + [[nodiscard]] constexpr memory_space get_memory_space() const noexcept { + return detail::constexpr_available_memory_spaces()[v_.index()]; + } + + /** + * @brief Check whether USM allocations are used. + * @return `true` if USM allocations are used, `false` otherwise (`[[nodiscard]]`) + */ + [[nodiscard]] bool uses_usm_allocations() const noexcept { + return use_usm_allocations_; } /** @@ -164,6 +177,8 @@ class device_view_wrapper { private: /// The wrapped `std::variant` type. variant_type v_; + /// `true` if USM allocations and, therefore, other Kokkos::MemorySpaces, are used. + bool use_usm_allocations_; }; /** @@ -171,14 +186,20 @@ class device_view_wrapper { * @tparam T the value type of the underlying Kokkos::View * @param[in] device the device for which this view should be allocated * @param[in] size the size of the Kokkos::View (number of elements **not** byte!) 
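Regarding the `use_usm_allocations` flag: the idea is that the same data can be placed either in the execution space's default memory space or in a host-and-device accessible ("USM") space. The sketch below shows the two allocation variants in plain Kokkos, assuming a Kokkos version that provides `Kokkos::SharedSpace` (4.0 or newer); the backend-specific USM space selection via `kokkos_execution_space_to_kokkos_memory_space_t` and the surrounding `device_view_wrapper` are omitted, and the helper names are hypothetical.

```cpp
// Hedged sketch: allocate a view either in the execution space's default
// memory space or in a host/device accessible space (Kokkos::SharedSpace is
// assumed to exist, i.e., Kokkos >= 4.0).
#include <Kokkos_Core.hpp>

#include <cstddef>

template <typename T, typename ExecutionSpace>
auto make_default_view(const ExecutionSpace &exec, const std::size_t size) {
    return Kokkos::View<T *, typename ExecutionSpace::memory_space>{ Kokkos::view_alloc(exec, "device_ptr_view"), size };
}

template <typename T, typename ExecutionSpace>
auto make_usm_view(const ExecutionSpace &exec, const std::size_t size) {
    return Kokkos::View<T *, Kokkos::SharedSpace>{ Kokkos::view_alloc(exec, "usm_device_ptr_view"), size };
}

int main(int argc, char *argv[]) {
    Kokkos::ScopeGuard guard{ argc, argv };
    Kokkos::DefaultExecutionSpace exec{};
    const auto regular = make_default_view<double>(exec, 1024);
    const auto usm = make_usm_view<double>(exec, 1024);
    return (regular.extent(0) == 1024 && usm.extent(0) == 1024) ? 0 : 1;
}
```

Since the two branches produce views of different types, a single factory like `make_device_view_wrapper` can only return both from one function because the wrapper's `std::variant` erases that type difference.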
- * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::ExecutionSpace specified by @p space (`[[nodiscard]]`) + * @param[in] use_usm_allocations decide whether a USM memory space should be used or not + * @return a Kokkos::View wrapper where the active member of the internal `std::variant` corresponds to the Kokkos::View in the Kokkos::MemorySpace based on the requested Kokkos::ExecutionSpace and @p use_usm_allocations (`[[nodiscard]]`) */ template -[[nodiscard]] device_view_wrapper make_device_view_wrapper(const device_wrapper &device, const std::size_t size) { +[[nodiscard]] device_view_wrapper make_device_view_wrapper(const device_wrapper &device, const std::size_t size, const bool use_usm_allocations) { return device.execute_and_return([&](const auto &value) { + // get the Kokkos execution space using kokkos_execution_space_type = ::plssvm::detail::remove_cvref_t; - - return device_view_wrapper{ Kokkos::View{ Kokkos::view_alloc(value, "device_ptr_view"), size } }; + // check whether we want to use USM allocations or not + if (use_usm_allocations) { + return device_view_wrapper{ Kokkos::View>{ Kokkos::view_alloc(value, "usm_device_ptr_view"), size }, use_usm_allocations }; + } else { + return device_view_wrapper{ Kokkos::View>{ Kokkos::view_alloc(value, "device_ptr_view"), size }, use_usm_allocations }; + } }); } diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp index bddadac01..1cff7f721 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_KOKKOS_CG_EXPLICIT_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -24,22 +26,29 @@ namespace plssvm::kokkos::detail { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam USMEnabledMemorySpace the Kokkos::MemorySpace that may use USM allocations + * @tparam target the target platform */ -template +template class device_kernel_symm { + /** + * @brief The type of the used Kokkos::View that may use USM allocations. + */ + template + using usm_device_view_type = Kokkos::View; // possible USM allocations /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. 
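`device_kernel_symm` operates on a symmetric matrix A of which only the padded upper triangle is stored row-wise; the index expressions of the form `r * (n + PADDING_SIZE) + c - r * (r + 1) / 2` in the kernel body implement exactly that packed layout. As a plain, single-threaded reference (padding, multi-device offsets, and blocking omitted; all names are illustrative, not the PLSSVM API), the layout and the SYMM it enables look roughly like this:

```cpp
// Reference for the packed symmetric SYMM: A is an n x n symmetric matrix of
// which only the upper triangle is stored row-wise, and C = alpha * A * B + beta * C
// is computed for a dense k-column B. Row padding is omitted for clarity.
#include <cstddef>
#include <vector>

using real_type = double;

// index of A(r, c) with r <= c in the row-wise packed upper triangle
std::size_t packed_upper_index(const std::size_t r, const std::size_t c, const std::size_t n) {
    return r * n + c - r * (r + 1) / 2;
}

// C (n x k, row-major) = alpha * A * B + beta * C with A packed upper triangular
void symm_packed_reference(const std::size_t n, const std::size_t k, const real_type alpha, const std::vector<real_type> &A_packed, const std::vector<real_type> &B, const real_type beta, std::vector<real_type> &C) {
    for (std::size_t row = 0; row < n; ++row) {
        for (std::size_t col = 0; col < k; ++col) {
            real_type sum{ 0.0 };
            for (std::size_t dim = 0; dim < n; ++dim) {
                // read A(row, dim) from the packed upper triangle, using symmetry for dim < row
                const real_type a = (dim < row) ? A_packed[packed_upper_index(dim, row, n)]
                                                : A_packed[packed_upper_index(row, dim, n)];
                sum += a * B[dim * k + col];
            }
            C[row * k + col] = alpha * sum + beta * C[row * k + col];
        }
    }
}
```

With n(n+1)/2 stored values instead of n², the same symmetry argument lets the kernel serve A(row, dim) for both sides of the diagonal from the upper triangle alone, which is what the two branches of the `A_cache` load below do.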
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -49,11 +58,11 @@ class device_kernel_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, usm_device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -69,79 +78,96 @@ class device_kernel_symm { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - 
const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type))); - Kokkos::mdspan> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> A_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> B_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for 
(unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = A_[(dim_block + threadIdx_y) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = B_[(dim_block + device_row_offset_ + threadIdx_y) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - // determine on which side of the diagonal we are located - if (dim + threadIdx_y + THREAD_BLOCK_SIZE < global_j) { - A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz + std::size_t{ 1 }) / std::size_t{ 2 }]; + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + temp[internal_i][internal_j] += sum; + } + } } else { - A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - - B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; - B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(dim + row_offset_ + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) 
+ global_i]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i); + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -151,10 +177,10 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; - device_view_type A_; + usm_device_view_type A_; device_view_type B_; const real_type beta_; device_view_type C_; @@ -168,14 +194,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
* @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam USMEnabledMemorySpace the Kokkos::MemorySpace that may use USM allocations + * @tparam target the target platform */ -template +template class device_kernel_symm_mirror { + /** + * @brief The type of the used Kokkos::View that may use USM allocations. + */ + template + using usm_device_view_type = Kokkos::View; // possible USM allocations /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** @@ -183,8 +216,8 @@ class device_kernel_symm_mirror { * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -194,12 +227,12 @@ class device_kernel_symm_mirror { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, usm_device_view_type A, device_view_type B, const real_type beta, device_view_type C, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -215,69 +248,90 @@ class device_kernel_symm_mirror { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = 
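Both SYMM kernels (and their HPX counterparts earlier in this patch) now stage a `THREAD_BLOCK_SIZE`-wide slice of A and B in scratchpad memory and accumulate an `INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE` register tile per thread; the new `if constexpr (target == target_platform::cpu)` branch only changes the loop order of that tile update. A condensed, Kokkos-free sketch of the two orders follows; the tile sizes and names are placeholders, and the per-team cache layout is simplified to a single per-thread tile.

```cpp
// Register-blocked tile update after a DIM_BLOCK-wide slice has been cached:
// CPU order keeps the reduction dimension innermost, GPU order keeps it outermost.
#include <array>

using real_type = double;
inline constexpr unsigned TILE = 4;       // stand-in for INTERNAL_BLOCK_SIZE
inline constexpr unsigned DIM_BLOCK = 8;  // stand-in for THREAD_BLOCK_SIZE

using tile_t = std::array<std::array<real_type, TILE>, TILE>;
using cache_t = std::array<std::array<real_type, TILE>, DIM_BLOCK>;  // cache(dim, tile index)

// CPU-friendly order: dim is the fastest moving index
void update_tile_cpu(tile_t &temp, const cache_t &A_cache, const cache_t &B_cache) {
    for (unsigned i = 0; i < TILE; ++i) {
        for (unsigned j = 0; j < TILE; ++j) {
            real_type sum{ 0.0 };
            for (unsigned dim = 0; dim < DIM_BLOCK; ++dim) {
                sum += A_cache[dim][j] * B_cache[dim][i];
            }
            temp[i][j] += sum;
        }
    }
}

// GPU-friendly order: dim is the slowest moving index
void update_tile_gpu(tile_t &temp, const cache_t &A_cache, const cache_t &B_cache) {
    for (unsigned dim = 0; dim < DIM_BLOCK; ++dim) {
        for (unsigned i = 0; i < TILE; ++i) {
            for (unsigned j = 0; j < TILE; ++j) {
                temp[i][j] += A_cache[dim][j] * B_cache[dim][i];
            }
        }
    }
}
```

The patch itself only documents which index moves fastest; a plausible rationale is that the scalar `sum` accumulator suits CPU register allocation and auto-vectorization, whereas the dim-outermost order maximizes reuse of each cached entry across the whole tile on GPUs.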
static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size * sizeof(real_type))); - Kokkos::mdspan> A_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> B_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two shared memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> A_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> B_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining 
features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y) + global_j]; - A_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = A_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz - std::size_t{ 1 }) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) + global_j]; - B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; - B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = B_[(row_offset_ + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rhs_ + PADDING_SIZE_sz) + global_i]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + A_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = A_[(dim_block + threadIdx_y) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = B_[(device_row_offset_ + dim_block + 
threadIdx_y) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache(dim, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j) * B_cache(dim, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_sz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -288,10 +342,10 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; - device_view_type A_; + usm_device_view_type A_; device_view_type B_; const real_type beta_; device_view_type C_; @@ -311,7 +365,7 @@ class device_kernel_inplace_matrix_add { * @brief The type of the used Kokkos::View. 
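// Sketch (not from this patch, purely illustrative): the `if constexpr (target == target_platform::cpu)`
// branches above pick the loop order per target. On the CPU target the reduction dimension is the
// innermost (fastest moving) loop, presumably so each temp[i][j] is accumulated in a register-resident
// scalar and the auto-vectorizer sees a plain reduction; on GPU-like targets the reduction dimension
// stays outermost, preserving the original scratchpad access pattern across the team. A minimal
// standalone illustration of the two orderings (tile layout and names here are assumptions, not PLSSVM code):
template <bool is_cpu, unsigned TB, unsigned IB>
void tile_dot_product(const double (&tile_a)[TB][IB * TB], const double (&tile_b)[TB][IB * TB],
                      double (&acc)[IB][IB], const unsigned tx, const unsigned ty) {
    if constexpr (is_cpu) {
        // reduction dimension innermost: one scalar accumulator per (i, j) pair
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                double sum = 0.0;
                for (unsigned dim = 0; dim < TB; ++dim) {
                    sum += tile_a[dim][ty * IB + j] * tile_b[dim][tx * IB + i];
                }
                acc[i][j] += sum;
            }
        }
    } else {
        // reduction dimension outermost: accumulate directly into the per-thread register array
        for (unsigned dim = 0; dim < TB; ++dim) {
            for (unsigned i = 0; i < IB; ++i) {
                for (unsigned j = 0; j < IB; ++j) {
                    acc[i][j] += tile_a[dim][ty * IB + j] * tile_b[dim][tx * IB + i];
                }
            }
        }
    }
}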
*/ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** @@ -338,26 +392,28 @@ class device_kernel_inplace_matrix_add { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // Calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // num_rhs + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -383,7 +439,7 @@ class 
device_kernel_inplace_matrix_scale { * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** @@ -410,26 +466,28 @@ class device_kernel_inplace_matrix_scale { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // Calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // num_rhs + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_sz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git 
a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp index 8e42e8b41..8daeb8a26 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -27,25 +29,32 @@ namespace plssvm::kokkos::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam USMEnabledMemorySpace the Kokkos::MemorySpace that may use USM allocations + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `standard_layout_tuple` */ -template +template class device_kernel_assembly { + /** + * @brief The type of the used Kokkos::View that may use USM allocations. + */ + template + using usm_device_view_type = Kokkos::View; // possible USM allocations /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -55,12 +64,12 @@ class device_kernel_assembly { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(device_view_type kernel_matrix_d, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(usm_device_view_type kernel_matrix, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -77,81 +86,103 @@ class device_kernel_assembly { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + 
grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + auto *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> data_i_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> data_j_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) 
{ + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + data_i_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const 
auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -160,11 +191,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - device_view_type kernel_matrix_d_; - device_view_type data_d_; + usm_device_view_type kernel_matrix_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; device_view_type q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index b22f69885..ad1c6536a 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add @@ -27,26 +29,27 @@ namespace plssvm::kokkos::detail { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. 
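// Sketch (not from this patch, purely illustrative): the explicit assembly kernel above stores
// `kernel_matrix_` in a row-major packed upper-triangular layout, i.e. device-local row j starts at
// j * (num_rows_ - device_row_offset_ + PADDING_SIZE) - j * (j + 1) / 2 and only the triangle (plus
// padding) is kept in memory. With the padding term left out, the offset formula can be checked in isolation:
#include <cassert>
#include <cstddef>

// flat index of element (row j, column i) with i >= j in a packed upper-triangular matrix with n columns
constexpr std::size_t packed_upper_index(const std::size_t j, const std::size_t i, const std::size_t n) {
    return j * n - j * (j + 1) / 2 + i;
}

int main() {
    constexpr std::size_t n = 5;
    std::size_t expected = 0;
    // the rows of the upper triangle are stored back-to-back, each one element shorter than the previous
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n; ++i) {
            assert(packed_upper_index(j, i, n) == expected);
            ++expected;
        }
    }
    return 0;
}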
* @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -58,13 +61,13 @@ class device_kernel_assembly_symm { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, device_view_type q, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type B, device_view_type C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, device_view_type q, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, device_view_type B, device_view_type C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -82,59 +85,81 @@ class device_kernel_assembly_symm { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const 
auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + // get the scratchpad memory pointer for later usage + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + Kokkos::mdspan> data_i_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> data_j_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; + // 
calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + data_i_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = data_[(feature_block + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } team.team_barrier(); // wait until all threads loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j)); + } } } } @@ -145,16 +170,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const 
auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -164,42 +191,44 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - Kokkos::mdspan> B_cache{ data_cache_ptr, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE }; - Kokkos::mdspan> C_out_cache{ data_cache_ptr + shmem_size, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE, FEATURE_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + Kokkos::mdspan> B_cache{ scratchpad_ptr, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz, THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> C_out_cache{ scratchpad_ptr + scratchpad_size, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz, THREAD_BLOCK_SIZE_uz }; // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y]; - B_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; - C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y) = real_type{ 0.0 }; - C_out_cache(internal * THREAD_BLOCK_SIZE + threadIdx_x, threadIdx_y + THREAD_BLOCK_SIZE) = real_type{ 0.0 }; + // store the values in the scratchpad memory + B_cache(internal * THREAD_BLOCK_SIZE + team_rank_x, team_rank_y) = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache(internal * THREAD_BLOCK_SIZE + team_rank_x, team_rank_y) = real_type{ 0.0 }; // SoA } team.team_barrier(); // wait until all threads loaded their part of the data - // calculate intermediate results and store 
them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in scratchpad memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE) += - temp[internal_i][internal_j] * B_cache(threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i, (class_idx + threadIdx_x) % FEATURE_BLOCK_SIZE); + C_out_cache(team_rank_y * INTERNAL_BLOCK_SIZE + internal_j, (class_idx + team_rank_x) % THREAD_BLOCK_SIZE) += + temp[internal_i][internal_j] * B_cache(team_rank_x * INTERNAL_BLOCK_SIZE + internal_i, (class_idx + team_rank_x) % THREAD_BLOCK_SIZE); } } team.team_barrier(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x)); - Kokkos::atomic_add(&C_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_sz], C_out_cache(threadIdx_y * INTERNAL_BLOCK_SIZE + internal, threadIdx_x + THREAD_BLOCK_SIZE)); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + Kokkos::atomic_add(&C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache(team_rank_y * INTERNAL_BLOCK_SIZE + internal, team_rank_x)); // SoA } team.team_barrier(); // wait until all threads updated C with their values } @@ -208,51 +237,54 @@ class device_kernel_assembly_symm { // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - Kokkos::mdspan> B_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> C_out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + Kokkos::mdspan> B_cache{ 
scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> C_out_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y]; - B_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; - C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; - C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + // store the values in the scratchpad memory + B_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = real_type{ 0.0 }; } team.team_barrier(); // wait until all threads loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in scratchpad memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_i * THREAD_BLOCK_SIZE + threadIdx_x) += - temp[internal_i][internal_j] * B_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j); + C_out_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, internal_i * THREAD_BLOCK_SIZE + team_rank_x) += + temp[internal_i][internal_j] * B_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, team_rank_y * INTERNAL_BLOCK_SIZE + internal_j); } } team.team_barrier(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + static_cast(internal); - Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], C_out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x)); - Kokkos::atomic_add(&C_[global_i * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_sz], 
C_out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x)); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + Kokkos::atomic_add(&C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x)); // SoA } team.team_barrier(); // wait until all threads updated C with their values } @@ -264,10 +296,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; device_view_type q_; - device_view_type data_d_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp index a2859a294..652aaa25c 100644 --- a/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/Kokkos/kernel/kernel_functions.hpp @@ -45,42 +45,17 @@ template /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] KOKKOS_INLINE_FUNCTION real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp index 767bfc958..51e67a89e 100644 --- a/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/Kokkos/kernel/predict_kernel.hpp @@ -13,9 +13,11 @@ #define PLSSVM_BACKENDS_KOKKOS_PREDICT_KERNEL_HPP_ #pragma once -#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp" // plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/kernel_function_types.hpp" // 
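// Sketch (not from this patch, purely illustrative): the powi change above drops the hard-coded cases
// for degree <= 6 in favor of a plain loop with `exponent` multiplications. Should larger polynomial
// degrees ever become relevant, exponentiation by squaring keeps the cost at O(log(exponent)):
[[nodiscard]] constexpr double powi_by_squaring(double base, int exponent) {
    // assumes exponent >= 0, matching the "only positive integer degree" precondition stated above
    double result = 1.0;
    while (exponent > 0) {
        if ((exponent & 1) == 1) {
            result *= base;  // fold in the contribution of the current bit
        }
        base *= base;  // square the base for the next bit
        exponent >>= 1;
    }
    return result;
}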
plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents, Kokkos::atomic_add @@ -24,39 +26,40 @@ namespace plssvm::kokkos::detail { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform */ -template +template class device_kernel_w_linear { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_w_linear(device_view_type w_d, device_view_type alpha_d, device_view_type sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(device_view_type w, device_view_type alpha, device_view_type support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, grid_size_x_{ grid_size_x } { } @@ -67,75 +70,100 @@ class device_kernel_w_linear { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_sz = 
static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_feature{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_alpha{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> feature_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> alpha_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * 
THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_sz) + sv + threadIdx_y]; // SoA - data_cache_alpha(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_sz) + sv + sv_offset_ + threadIdx_y]; // AoS - } - team.team_barrier(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_feature(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_feature); + // store the values in the scratchpad memory + feature_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_y]; // AoS + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache(sv, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * 
feature_cache(sv, team_rank_x * INTERNAL_BLOCK_SIZE + internal_feature); + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache(sv, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * feature_cache(sv, team_rank_x * INTERNAL_BLOCK_SIZE + internal_feature); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// @cond Doxygen_suppress - device_view_type w_d_; - device_view_type alpha_d_; - device_view_type sv_d_; + device_view_type w_; + device_view_type alpha_; + device_view_type support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; const std::size_t grid_size_x_; @@ -143,24 +171,25 @@ class device_kernel_w_linear { }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform */ -template +template class device_kernel_predict_linear { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the Kokkos kernel function object. 
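The w-vector kernel above, like every compute kernel touched in the remainder of this diff, switches its dot-product loop order depending on target_platform::cpu. As a rough standalone illustration of that pattern (not PLSSVM code; the tile sizes TB and IB and the cache arrays merely stand in for THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, and the scratchpad mdspans), the CPU variant keeps the reduction index innermost and accumulates into a scalar the auto-vectorizer can hold in a register, while the GPU variant keeps the reduction index outermost so the cached tiles are swept row by row between barriers:

constexpr unsigned TB = 8;  // stands in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // stands in for INTERNAL_BLOCK_SIZE

// CPU ordering: the reduction index k is the fastest moving index; the local sum keeps the
// accumulation in a register and leaves a contiguous inner loop for the auto-vectorizer
void accumulate_cpu(const double a[TB][IB * TB], const double b[TB][IB * TB],
                    double temp[IB][IB], const unsigned tx, const unsigned ty) {
    for (unsigned i = 0; i < IB; ++i) {
        for (unsigned j = 0; j < IB; ++j) {
            double sum = 0.0;
            for (unsigned k = 0; k < TB; ++k) {
                sum += a[k][ty * IB + j] * b[k][tx * IB + i];
            }
            temp[i][j] += sum;
        }
    }
}

// GPU ordering: the reduction index k is the slowest moving index, so the threads of a team
// touch the cached tiles row by row between barriers
void accumulate_gpu(const double a[TB][IB * TB], const double b[TB][IB * TB],
                    double temp[IB][IB], const unsigned tx, const unsigned ty) {
    for (unsigned k = 0; k < TB; ++k) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                temp[i][j] += a[k][ty * IB + j] * b[k][tx * IB + i];
            }
        }
    }
}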
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point @@ -168,11 +197,11 @@ class device_kernel_predict_linear { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] grid_size_x the size of the execution grid in x-dimension */ - device_kernel_predict_linear(device_view_type prediction_d, device_view_type w_d, device_view_type rho_d, device_view_type predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(device_view_type prediction, device_view_type w, device_view_type rho, device_view_type predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -186,76 +215,97 @@ class device_kernel_predict_linear { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; - const auto pp_idx_linear = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_w{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large + + // create two scratchpad memory arrays used for caching + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); + Kokkos::mdspan> pp_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> w_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_w(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y) * (num_classes_ + PADDING_SIZE_sz) + 
global_class_idx]; - data_cache_w(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = w_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_classes_ + PADDING_SIZE_sz) + global_class_idx]; - } - team.team_barrier(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_class) * data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + pp_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = predict_points_[(feature_block + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = w_[(feature_block + threadIdx_y) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + team.team_barrier(); // wait until all threads loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp); + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_class) * pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp); + } + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the 
calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - device_view_type prediction_d_; - device_view_type w_d_; - device_view_type rho_d_; - device_view_type predict_points_d_; + device_view_type prediction_; + device_view_type w_; + device_view_type rho_; + device_view_type predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -266,27 +316,28 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @tparam ExecutionSpace the Kokkos::ExecutionSpace used to execute the kernel + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_predict { /** * @brief The type of the used Kokkos::View. */ template - using device_view_type = Kokkos::View; + using device_view_type = Kokkos::View>; // no USM allocations public: /** * @brief Initialize the SYCL kernel function object. 
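Before the generic device_kernel_predict below, a plain, unblocked reference loop may help to see what device_kernel_predict_linear computes per output entry, namely prediction[p][c] = dot(w[:, c], predict_points[p, :]) - rho[c]. This is only a sketch: it ignores the padding and the SoA layouts of the real kernel, and the flat row-major indexing chosen here is an assumption made for the example.

#include <cstddef>
#include <vector>

void predict_linear_reference(std::vector<double> &prediction,            // num_predict_points x num_classes
                              const std::vector<double> &w,               // num_features x num_classes
                              const std::vector<double> &rho,             // num_classes
                              const std::vector<double> &predict_points,  // num_predict_points x num_features
                              const std::size_t num_classes,
                              const std::size_t num_predict_points,
                              const std::size_t num_features) {
    for (std::size_t p = 0; p < num_predict_points; ++p) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            double sum = 0.0;
            for (std::size_t f = 0; f < num_features; ++f) {
                sum += w[f * num_classes + c] * predict_points[p * num_features + f];
            }
            prediction[p * num_classes + c] = sum - rho[c];  // subtract the bias once per class
        }
    }
}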
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -296,12 +347,12 @@ class device_kernel_predict { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(device_view_type prediction_d, device_view_type alpha_d, device_view_type rho_d, device_view_type sv_d, device_view_type predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(device_view_type prediction, device_view_type alpha, device_view_type rho, device_view_type support_vectors, device_view_type predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, @@ -317,55 +368,72 @@ class device_kernel_predict { */ KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto team_rank_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE; + const auto team_rank_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in team x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in team y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in team x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in team y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current team in league x-dimension + offsets if the league size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current team in league y-dimension + offsets if the league size is too large - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; - real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); + // get the scratchpad memory pointer for later usage + constexpr std::size_t scratchpad_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; + real_type *scratchpad_ptr = static_cast(team.team_shmem().get_shmem(std::size_t{ 2 } * scratchpad_size * sizeof(real_type))); // create 
a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - Kokkos::mdspan> data_cache_pp{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_sv{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + // reinterpret the scratchpad memory to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + Kokkos::mdspan> pp_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> sv_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_pp(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = predict_points_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_predict_points_ + PADDING_SIZE_sz) + global_pp_idx]; - data_cache_sv(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; - data_cache_sv(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = sv_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the scratchpad memory + pp_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = predict_points_[(feature_block + threadIdx_y) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = support_vectors_[(feature_block + threadIdx_y) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } team.team_barrier(); // wait until all threads loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; 
++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv), - data_cache_pp(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_pd)); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_sv), + pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp)); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache(feature, team_rank_y * INTERNAL_BLOCK_SIZE + internal_sv), + pp_cache(feature, team_rank_x * INTERNAL_BLOCK_SIZE + internal_pp)); + } } } } @@ -374,55 +442,57 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { - // create the shared memory arrays used for caching data point features - Kokkos::mdspan> alpha_cache{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> out_cache{ data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory + // reinterpret the scratchpad memory to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + Kokkos::mdspan> alpha_cache{ scratchpad_ptr, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + Kokkos::mdspan> out_cache{ scratchpad_ptr + scratchpad_size, THREAD_BLOCK_SIZE_uz, INTERNAL_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz }; + + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into scratchpad memory for (unsigned internal = 0; 
internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; - alpha_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = alpha_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_sv_ + PADDING_SIZE_sz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the scratchpad memory + alpha_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = alpha_[(class_block + threadIdx_y) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == std::size_t{ 0 }) { - out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y]; - out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = -rho_d_[dim + threadIdx_y + THREAD_BLOCK_SIZE_sz]; + out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = -rho_[class_block + threadIdx_y]; } else { - out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; - out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = real_type{ 0.0 }; + out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x) = real_type{ 0.0 }; } } team.team_barrier(); // wait until all threads loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate intermediate results and store them in scratchpad memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, internal_pd * THREAD_BLOCK_SIZE + threadIdx_x) += - temp[internal_pd][internal_sv] * alpha_cache((class_idx + threadIdx_y) % FEATURE_BLOCK_SIZE, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_sv); + out_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, internal_pp * THREAD_BLOCK_SIZE + team_rank_x) += + temp[internal_pp][internal_sv] * alpha_cache((class_idx + team_rank_y) % THREAD_BLOCK_SIZE, team_rank_y * INTERNAL_BLOCK_SIZE + internal_sv); } } team.team_barrier(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y], out_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x)); - Kokkos::atomic_add(&prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_sz) + dim + threadIdx_y 
+ THREAD_BLOCK_SIZE_sz], out_cache(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x)); + Kokkos::atomic_add(&prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache(team_rank_y, internal * THREAD_BLOCK_SIZE + team_rank_x)); } team.team_barrier(); // wait until all threads updated their part of the prediction } @@ -431,11 +501,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - device_view_type prediction_d_; - device_view_type alpha_d_; - device_view_type rho_d_; - device_view_type sv_d_; - device_view_type predict_points_d_; + device_view_type prediction_; + device_view_type alpha_; + device_view_type rho_; + device_view_type support_vectors_; + device_view_type predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/Kokkos/memory_space.hpp b/include/plssvm/backends/Kokkos/memory_space.hpp new file mode 100644 index 000000000..eba6e1674 --- /dev/null +++ b/include/plssvm/backends/Kokkos/memory_space.hpp @@ -0,0 +1,77 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Memory space enumeration for the MemorySpaces in Kokkos. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_HPP_ +#pragma once + +#include "fmt/base.h" // fmt::formatter +#include "fmt/ostream.h" // fmt::ostream_formatter + +#include // std::ostream forward declaration +#include // std::vector + +namespace plssvm::kokkos { + +/** + * @brief Enum class for all memory spaces supported by [Kokkos](https://github.com/kokkos/kokkos). + */ +enum class memory_space { + /** Memory space representing traditional memory accessible from the CPU. */ + host_space, + /** Memory space representing memory on a CUDA-capable GPU. */ + cuda_space, + /** Memory space representing unified virtual memory on a CUDA-capable GPU system. */ + cuda_usm_space, + /** Memory space representing memory in the HIP GPU programming environment. */ + hip_space, + /** Memory space representing page-migrating memory in the HIP GPU programming environment. */ + hip_usm_space, + /** Memory space representing device memory in the SYCL GPU programming environment. */ + sycl_space, + /** Memory space representing page-migrating memory in the SYCL GPU programming environment */ + sycl_usm_space +}; + +/** + * @brief Output the memory @p space to the given output-stream @p out. + * @param[in,out] out the output-stream to write the memory space to + * @param[in] space the Kokkos memory space + * @return the output-stream + */ +std::ostream &operator<<(std::ostream &out, memory_space space); + +/** + * @brief Use the input-stream @p in to initialize the memory @p space. + * @param[in,out] in input-stream to extract the memory space from + * @param[in] space the Kokkos memory space + * @return the input-stream + */ +std::istream &operator>>(std::istream &in, memory_space &space); + +/** + * @brief List all available Kokkos::MemorySpaces. + * @details Only Kokkos::MemorySpaces that where enabled during the CMake configuration are available. + * The `memory_space::host_space` is always included. 
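A hypothetical usage sketch for the new memory_space enum (it relies only on the stream operators declared above and the fmt::formatter specialization provided at the end of this header; the enumerator spelling accepted by operator>> is an assumption):

#include "plssvm/backends/Kokkos/memory_space.hpp"

#include "fmt/format.h"

#include <sstream>

int main() {
    plssvm::kokkos::memory_space space{};
    std::istringstream{ "cuda_usm_space" } >> space;   // parse via operator>> (accepted spelling is assumed)
    fmt::print("selected memory space: {}\n", space);  // formatted via the fmt::ostream_formatter specialization
    return 0;
}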
+ * @return the available Kokkos::MemorySpaces (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector list_available_memory_spaces(); + +} // namespace plssvm::kokkos + +/// @cond + +template <> +struct fmt::formatter : fmt::ostream_formatter { }; + +/// @endcond + +#endif // PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_HPP_ diff --git a/include/plssvm/backends/Kokkos/memory_space_type_traits.hpp b/include/plssvm/backends/Kokkos/memory_space_type_traits.hpp new file mode 100644 index 000000000..a25a846ab --- /dev/null +++ b/include/plssvm/backends/Kokkos/memory_space_type_traits.hpp @@ -0,0 +1,265 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Memory space type traits for the MemorySpaces in Kokkos. + */ + +#ifndef PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_TYPE_TRAITS_HPP_ +#define PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_TYPE_TRAITS_HPP_ +#include +#pragma once + +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/Kokkos/execution_space_type_traits.hpp" // plssvm::kokkos::kokkos_type_to_execution_space_v +#include "plssvm/backends/Kokkos/memory_space.hpp" // plssvm::kokkos::memory_space + +#include "Kokkos_Core.hpp" // Kokkos macros, Kokkos MemorySpace types + +namespace plssvm::kokkos { + +//***************************************************// +// memory_space_to_kokkos_type // +//***************************************************// + +/** + * @brief Uninstantiated base type to convert a `memory_space` enum value to a Kokkos::MemorySpace type. + */ +template +struct memory_space_to_kokkos_type; + +/** + * @brief Convert a `memory_space::host_space` enum value to a `Kokkos::HostSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::HostSpace; +}; + +#if defined(KOKKOS_ENABLE_CUDA) +/** + * @brief Convert a `memory_space::cuda_space` enum value to a `Kokkos::CudaSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::CudaSpace; +}; + +/** + * @brief Convert a `memory_space::cuda_usm_space` enum value to a `Kokkos::CudaUVMSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +/** + * @brief Convert a `memory_space::hip_space` enum value to a `Kokkos::HIPSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::HIPSpace; +}; + +/** + * @brief Convert a `memory_space::hip_usm_space` enum value to a `Kokkos::HIPManagedSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::HIPManagedSpace; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) +/** + * @brief Convert a `memory_space::sycl_space` enum value to a `Kokkos::SYCLDeviceUSMSpace` Kokkos::MemorySpace type. + */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::SYCLDeviceUSMSpace; +}; + +/** + * @brief Convert a `memory_space::sycl_usm_space` enum value to a `Kokkos::SYCLSharedUSMSpace` Kokkos::MemorySpace type. 
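A minimal illustration of the enum-to-type direction declared above (not part of the patch; it assumes the enum value is the non-type template parameter of memory_space_to_kokkos_type, and the CUDA check is guarded because that specialization only exists when Kokkos was built with CUDA support):

#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp"

#include "Kokkos_Core.hpp"

#include <type_traits>

// the host mapping is always available
static_assert(std::is_same_v<plssvm::kokkos::memory_space_to_kokkos_type<plssvm::kokkos::memory_space::host_space>::type,
                             Kokkos::HostSpace>);

#if defined(KOKKOS_ENABLE_CUDA)
// the CUDA USM mapping selects the unified virtual memory space
static_assert(std::is_same_v<plssvm::kokkos::memory_space_to_kokkos_type<plssvm::kokkos::memory_space::cuda_usm_space>::type,
                             Kokkos::CudaUVMSpace>);
#endif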
+ */ +template <> +struct memory_space_to_kokkos_type { + using type = Kokkos::SYCLSharedUSMSpace; +}; +#endif + +/** + * @brief Convert the `memory_space` @p space to the corresponding Kokkos::MemorySpace type. + * @tparam space the enum value to convert + */ +template +using memory_space_to_kokkos_type_t = typename memory_space_to_kokkos_type::type; + +//***************************************************// +// kokkos_type_to_memory_space // +//***************************************************// + +/** + * @brief Uninstantiated base type to convert a Kokkos::MemorySpace type to a `memory_space` enum value. + */ +template +struct kokkos_type_to_memory_space; + +/** + * @brief Convert a `Kokkos::HostSpace` Kokkos::MemorySpace type to a `memory_space::host_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::host_space; +}; + +#if defined(KOKKOS_ENABLE_CUDA) +/** + * @brief Convert a `Kokkos::CudaSpace` Kokkos::MemorySpace type to a `memory_space::cuda_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::cuda_space; +}; + +/** + * @brief Convert a `Kokkos::CudaUVMSpace` Kokkos::MemorySpace type to a `memory_space::cuda_usm_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::cuda_usm_space; +}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +/** + * @brief Convert a `Kokkos::HIPSpace` Kokkos::MemorySpace type to a `memory_space::hip_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::hip_space; +}; + +/** + * @brief Convert a `Kokkos::HIPManagedSpace` Kokkos::MemorySpace type to a `memory_space::hip_usm_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::hip_usm_space; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) +/** + * @brief Convert a `Kokkos::SYCLDeviceUSMSpace` Kokkos::MemorySpace type to a `memory_space::sycl_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::sycl_space; +}; + +/** + * @brief Convert a `Kokkos::SYCLSharedUSMSpace` Kokkos::MemorySpace type to a `memory_space::sycl_usm_space` enum value. + */ +template <> +struct kokkos_type_to_memory_space { + constexpr static memory_space value = memory_space::sycl_usm_space; +}; +#endif + +/** + * @brief Convert the Kokkos::MemorySpace type @p MemorySpace to the corresponding `memory_space` enum value. + * @tparam MemorySpace the Kokkos::MemorySpace type to convert + */ +template +inline constexpr memory_space kokkos_type_to_memory_space_v = kokkos_type_to_memory_space::value; + +//***************************************************// +// execution_space_to_memory_space // +//***************************************************// + +/** + * @brief Convert a host `execution_space` enum value to a `memory_space::host_space` enum value. + */ +template +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::host_space; +}; + +/** + * @brief Convert an `execution_space::cuda` that does not use USM allocations enum value to a `memory_space::cuda_space` enum value. 
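The reverse direction, kokkos_type_to_memory_space_v, maps a concrete Kokkos::MemorySpace type back to the enum, e.g. for reporting where a View lives. A minimal sketch under the same assumptions as the previous example:

#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp"

#include "Kokkos_Core.hpp"

// round trip: Kokkos::HostSpace maps back to memory_space::host_space
static_assert(plssvm::kokkos::kokkos_type_to_memory_space_v<Kokkos::HostSpace>
              == plssvm::kokkos::memory_space::host_space);

#if defined(KOKKOS_ENABLE_HIP)
// the page-migrating HIP space maps to the corresponding USM enum value
static_assert(plssvm::kokkos::kokkos_type_to_memory_space_v<Kokkos::HIPManagedSpace>
              == plssvm::kokkos::memory_space::hip_usm_space);
#endif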
+ */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::cuda_space; +}; + +/** + * @brief Convert an `execution_space::cuda` that does use USM allocations enum value to a `memory_space::cuda_usm_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::cuda_usm_space; +}; + +/** + * @brief Convert an `execution_space::hip` that does not use USM allocations enum value to a `memory_space::hip_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::hip_space; +}; + +/** + * @brief Convert an `execution_space::hip` that does use USM allocations enum value to a `memory_space::hip_usm_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::hip_usm_space; +}; + +/** + * @brief Convert an `execution_space::sycl` that does not use USM allocations enum value to a `memory_space::sycl_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::sycl_space; +}; + +/** + * @brief Convert an `execution_space::sycl` that does use USM allocations enum value to a `memory_space::sycl_usm_space` enum value. + */ +template <> +struct execution_space_to_memory_space { + constexpr static memory_space value = memory_space::sycl_usm_space; +}; + +/** + * @brief Convert the `execution_space` enum value @p space together with the @p UseUSM flag indication whether USM allocation should be used to the corresponding `memory_space` enum value. + * @tparam space the `execution_space` enum value to convert + * @tparam UseUSM `true` if USM allocations should be used + */ +template +inline constexpr memory_space execution_space_to_memory_space_v = execution_space_to_memory_space::value; + +//***************************************************// +// kokkos_execution_space_to_kokkos_memory_space // +//***************************************************// + +/** + * @brief Convert the Kokkos::ExecutionSpace type together with the @p UseUSM flag indication whether USM allocation should be used to the corresponding Kokkos::MemorySpace type. + * @tparam ExecutionSpace the Kokkos::ExecutionSpace type + * @tparam UseUSM `true` if USM allocations should be used + */ +template +using kokkos_execution_space_to_kokkos_memory_space_t = memory_space_to_kokkos_type_t, UseUSM>>; + +} // namespace plssvm::kokkos + +#endif // PLSSVM_BACKENDS_KOKKOS_MEMORY_SPACE_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/backends/OpenCL/csvm.hpp b/include/plssvm/backends/OpenCL/csvm.hpp index f52ec29cd..1ac955019 100644 --- a/include/plssvm/backends/OpenCL/csvm.hpp +++ b/include/plssvm/backends/OpenCL/csvm.hpp @@ -112,7 +112,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t +#include // std::variant namespace plssvm::opencl::detail { @@ -28,13 +29,14 @@ namespace plssvm::opencl::detail { * @tparam T the type of the kernel pointer to wrap */ template -class device_ptr : public ::plssvm::detail::gpu_device_ptr> { +class device_ptr : public ::plssvm::detail::gpu_device_ptr, device_ptr> { /// The template base type of the OpenCL device_ptr class. 
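Tying the Kokkos traits from the header above together: the kernels earlier in this diff mark their Views with a "no USM allocations" comment, which corresponds to instantiating kokkos_execution_space_to_kokkos_memory_space_t with UseUSM = false. A minimal sketch of such a view alias (the exact View template arguments used in PLSSVM are an assumption here):

#include "plssvm/backends/Kokkos/memory_space_type_traits.hpp"

#include "Kokkos_Core.hpp"

#include <type_traits>

// pick the plain (non-USM) device memory space that matches the given ExecutionSpace
template <typename ExecutionSpace, typename T>
using device_view_type = Kokkos::View<T *, plssvm::kokkos::kokkos_execution_space_to_kokkos_memory_space_t<ExecutionSpace, false>>;

#if defined(KOKKOS_ENABLE_CUDA)
// with the CUDA execution space and UseUSM = false this selects Kokkos::CudaSpace
static_assert(std::is_same_v<device_view_type<Kokkos::Cuda, double>::memory_space, Kokkos::CudaSpace>);
#endif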
- using base_type = ::plssvm::detail::gpu_device_ptr>; + using base_type = ::plssvm::detail::gpu_device_ptr, device_ptr>; using base_type::data_; using base_type::queue_; using base_type::shape_; + using base_type::use_usm_allocations_; public: // Be able to use overloaded base class functions. @@ -60,21 +62,24 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr // std::string #include // std::string_view #include // std::forward, std::pair +#include // std::variant, std::visit #include // std::vector /** @@ -126,11 +129,12 @@ void device_synchronize(const command_queue &queue); * * @param[in] comm the MPI communicator * @param[in] contexts the used OpenCL contexts + * @param[in] target the target platform to create the kernel binaries for * @param[in] kernel_function the kernel function * @throws plssvm::invalid_file_format_exception if the file couldn't be read using [`std::ifstream::read`](https://en.cppreference.com/w/cpp/io/basic_istream/read) * @return [the command queues with all necessary kernels; information regarding the JIT compilation] (`[[nodiscard]]`) */ -[[nodiscard]] std::pair, jit_info> create_command_queues(const mpi::communicator &comm, const std::vector &contexts, kernel_function_type kernel_function); +[[nodiscard]] std::pair, jit_info> create_command_queues(const mpi::communicator &comm, const std::vector &contexts, target_platform target, kernel_function_type kernel_function); /** * @brief Set all arguments in the parameter pack @p args for the kernel @p kernel. @@ -143,7 +147,17 @@ inline void set_kernel_args(cl_kernel kernel, Args... args) { cl_uint i = 0; // iterate over parameter pack and set OpenCL kernel ([&](auto &arg) { - const error_code ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + error_code ec{}; + // check if we have to set a variant value + if constexpr (::plssvm::detail::is_variant_v<::plssvm::detail::remove_cvref_t>) { + std::visit(::plssvm::detail::visit_overload{ + [&](cl_mem &kernel_arg) { ec = clSetKernelArg(kernel, i++, sizeof(decltype(kernel_arg)), &kernel_arg); }, + [&](auto &kernel_arg) { ec = clSetKernelArgSVMPointer(kernel, i++, kernel_arg); } }, + arg); + } else { + // set kernel argument normally + ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + } PLSSVM_OPENCL_ERROR_CHECK(ec, fmt::format("error setting OpenCL kernel argument {}", i - 1)) }(args), ...); diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl index 0f1ac247a..8b474df77 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/blas.cl @@ -16,8 +16,8 @@ * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
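The std::visit dispatch in set_kernel_args above distinguishes classic buffer arguments from SVM pointers. A standalone sketch of the same idea (not PLSSVM code; the kernel_arg variant and the function name are made up for the example, and clSetKernelArgSVMPointer requires OpenCL 2.0 or newer):

#define CL_TARGET_OPENCL_VERSION 200

#include <CL/cl.h>

#include <type_traits>
#include <variant>

// a kernel argument is either a regular buffer handle or a shared virtual memory pointer
using kernel_arg = std::variant<cl_mem, void *>;

cl_int set_single_kernel_arg(cl_kernel kernel, const cl_uint index, const kernel_arg &arg) {
    return std::visit([&](auto value) -> cl_int {
        if constexpr (std::is_same_v<decltype(value), cl_mem>) {
            // regular buffer object -> pass the cl_mem handle by address
            return clSetKernelArg(kernel, index, sizeof(cl_mem), &value);
        } else {
            // SVM allocation -> pass the raw pointer directly
            return clSetKernelArgSVMPointer(kernel, index, value);
        }
    }, arg);
}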
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -26,78 +26,90 @@ * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, const ulong device_specific_num_rows, const ulong row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, const ulong device_num_rows, const ulong device_row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // #rhs - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # row - const ulong j_linear = blockIdx_y * blockDim_y * 
INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + __local real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // create a thread private array used for internal caching + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // determine on which side of the diagonal we are located - if (dim + get_local_id(1) < global_j) { - A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ul) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + (ulong) 1) / (ulong) 2]; - } else { - A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ul) + dim + threadIdx_y - global_j * (global_j + (ulong) 1) / (ulong) 2]; - } - // determine on which side of the diagonal we are located - if (dim + get_local_id(1) + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows - row_offset + PADDING_SIZE_ul) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ul + (ulong) 1) / (ulong) 2]; - } else { - A_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul - global_j * (global_j + (ulong) 1) / (ulong) 2]; - } + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const ulong j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows - B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ul) + global_i]; - B_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rhs + PADDING_SIZE_ul) + global_i]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (ulong dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong 
global_i_idx_linear = i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + get_local_id(1) < global_j_idx_linear) { + A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + (ulong) 1) / (ulong) 2]; // SoA, upper triangular matrix only + } else { + A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + (ulong) 1) / (ulong) 2]; // SoA, upper triangular matrix only + } + B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the dot product calculation, the dim is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum = 0.0; + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } +#else + // perform the dot product calculation, the dim is the slowest moving index + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + // calculate the indices to access the global data and the data with respect to the current device + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong device_global_j_idx = j_idx + (ulong) internal_j; + const ulong 
global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i]; + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +121,8 @@ __kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, cons * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -119,68 +131,85 @@ __kernel void device_kernel_symm(const ulong num_rows, const ulong num_rhs, cons * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_symm_mirror(const ulong num_rows, const ulong num_rhs, const ulong num_mirror_rows, const ulong device_specific_num_rows, const ulong row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_symm_mirror(const ulong num_rows, const ulong num_rhs, const ulong num_mirror_rows, const ulong device_num_rows, const ulong device_row_offset, const real_type alpha, const __global real_type *A, const __global real_type *B, const real_type beta, __global real_type *C, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y 
= get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // #rhs - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # row - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + __local real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // create a thread private array used for internal caching + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ul) - (dim + threadIdx_y - (ulong) 1) * (dim + threadIdx_y) / (ulong) 2 + device_specific_num_rows - (dim + get_local_id(1)) + global_j]; - A_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows - row_offset + PADDING_SIZE_ul) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ul - (ulong) 1) * (dim + get_local_id(1) + THREAD_BLOCK_SIZE_ul) / (ulong) 2 + device_specific_num_rows - (dim + get_local_id(1) + THREAD_BLOCK_SIZE_ul) + global_j]; - B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ul) + global_i]; - B_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rhs + PADDING_SIZE_ul) + global_i]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const ulong j_idx_linear = blockIdx_y * 
blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (ulong dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_i_idx_linear = i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - (ulong) 1) * (dim_block + threadIdx_y) / (ulong) 2 + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the dot product calculation, the dim is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum = 0.0; + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; } } +#else + // perform the dot product calculation, the dim is the slowest moving index + for (uint dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong partial_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + device_specific_num_rows + j + (ulong) internal_j; + // calculate the indices to access the global data and the data with respect to 
the current device + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong partial_global_j_idx = j_idx + (ulong) internal_j; + const ulong global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ul) + global_i]; + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -196,23 +225,24 @@ __kernel void device_kernel_symm_mirror(const ulong num_rows, const ulong num_rh */ __kernel void device_kernel_inplace_matrix_add(const ulong num_cols, real_type __global *lhs, const real_type __global *rhs, const ulong grid_x_offset, const ulong grid_y_offset) { // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // # num_rows - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # num_rhs + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong global_j = j + (ulong) internal_j; + // calculate the indices to access the global data + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong global_j_idx = j_idx + (ulong) internal_j; - lhs[global_i * (num_cols + PADDING_SIZE_ul) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ul) + global_j]; + 
lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -227,23 +257,24 @@ __kernel void device_kernel_inplace_matrix_add(const ulong num_cols, real_type _ */ __kernel void device_kernel_inplace_matrix_scale(const ulong num_cols, real_type __global *lhs, const real_type scale, const ulong grid_x_offset, const ulong grid_y_offset) { // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; // # num_rows - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; // # num_rhs + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = i + (ulong) internal_i; - const ulong global_j = j + (ulong) internal_j; + // calculate the indices to access the global data + const ulong global_i_idx = i_idx + (ulong) internal_i; + const ulong global_j_idx = j_idx + (ulong) internal_j; - lhs[global_i * (num_cols + PADDING_SIZE_ul) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl index 481945ca6..34f6afb48 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl @@ -14,11 +14,11 @@ /** * @brief Create the explicit kernel matrix using the kernel function determined at runtime. 
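A note on the indexing that keeps reappearing in blas.cl and in the assembly kernels below: `A` and `kernel_matrix` are stored as a row-major packed upper-triangular matrix with padded rows, so element (j, i) with i >= j lives at `j * (n + PADDING_SIZE) + i - j * (j + 1) / 2`, where n corresponds to `num_rows - device_row_offset`. The following host-side C++ sketch only illustrates that index arithmetic; the function name and test values are hypothetical and not part of PLSSVM.

```cpp
#include <cassert>
#include <cstddef>

// row-major packed upper-triangular indexing with padded rows: row j stores the
// columns j..n-1 followed by `padding` extra entries, so element (j, i) with
// i >= j ends up at offset j * (n + padding) + i - j * (j + 1) / 2
std::size_t packed_upper_index(const std::size_t j, const std::size_t i, const std::size_t n, const std::size_t padding) {
    assert(j <= i && i < n && "only the upper triangular part is stored");
    return j * (n + padding) + i - j * (j + 1) / 2;
}

int main() {
    constexpr std::size_t n = 4;
    constexpr std::size_t padding = 2;
    // row 1 starts right after the n + padding entries of row 0
    assert(packed_upper_index(1, 1, n, padding) == n + padding);
    // consecutive columns within a row are stored contiguously
    assert(packed_upper_index(2, 3, n, padding) == packed_upper_index(2, 2, n, padding) + 1);
    return 0;
}
```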
* @details The `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -27,78 +27,96 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... only added for Doxygen) */ -__kernel void device_kernel_assembly(__global real_type *kernel_matrix_d, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly(__global real_type *kernel_matrix, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets 
if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + __local real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_x >= blockIdx_y) { - // create a thread private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const ulong j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory 
accesses + const ulong global_i_idx_linear = device_row_offset + i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = device_row_offset + j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong device_global_i = i + (ulong) internal_i; - const ulong global_i = row_offset + i + (ulong) internal_i; - const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + // calculate the indices to access the global data and the data with respect to the current device + const ulong device_global_i_idx = i_idx + (ulong) internal_i; + const ulong global_i_idx = device_row_offset + device_global_i_idx; + const ulong device_global_j_idx = j_idx + (ulong) internal_j; + const ulong global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform 
out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; + // apply the final kernel function + temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + (ulong) 1) / (ulong) 2 + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl b/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl index cbcbea498..aecb2ab8b 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_implicit/kernel_matrix_assembly_blas.cl @@ -19,10 +19,10 @@ * @note The beta factor is already applied to C before this kernel starts! * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -33,172 +33,203 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... 
only added for Doxygen) */ -__kernel void device_kernel_assembly_symm(const real_type alpha, const __global real_type *q, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const real_type QA_cost, const real_type cost, const __global real_type *B, __global real_type *C, const ulong num_classes, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly_symm(const real_type alpha, const __global real_type *q, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const real_type QA_cost, const real_type cost, const __global real_type *B, __global real_type *C, const ulong num_classes, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const ulong i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const ulong j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * 
INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const ulong j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two local memory arrays used for caching + __local real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // only calculate the upper triangular matrix -> can't use threadIdx since all work-items in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*data_i_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*data_j_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_i_idx_linear = device_row_offset + i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_j_idx_linear = device_row_offset + j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; - // perform 
the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(feature_block + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = row_offset + i + (ulong) internal_i; - const ulong device_global_i = i + (ulong) internal_i; - const ulong global_j = row_offset + j + (ulong) internal_j; - const ulong device_global_j = j + (ulong) internal_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_i][internal_j] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const ulong device_global_i_idx = i_idx + (ulong) internal_i; + const ulong global_i_idx = device_row_offset + device_global_i_idx; + const ulong device_global_j_idx = j_idx + (ulong) internal_j; + const ulong global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if 
(device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_i][internal_j] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { + // be sure to set the value to zero otherwise temp[internal_i][internal_j] = (real_type) 0.0; } } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // reinterpret cache arrays with interchanged dimensions - __local real_type (*B_cache)[FEATURE_BLOCK_SIZE] = (__local real_type (*)[FEATURE_BLOCK_SIZE]) data_cache_i; - __local real_type (*C_out_cache)[FEATURE_BLOCK_SIZE] = (__local real_type (*)[FEATURE_BLOCK_SIZE]) data_cache_j; + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + __local real_type(*B_cache)[THREAD_BLOCK_SIZE] = (__local real_type(*)[THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*C_out_cache)[THREAD_BLOCK_SIZE] = (__local real_type(*)[THREAD_BLOCK_SIZE]) cache_two; // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ul) { + for (ulong class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_i_idx_linear = device_row_offset + i_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = alpha * B[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1 + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = (real_type) 0.0; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1 + THREAD_BLOCK_SIZE] = (real_type) 0.0; + // store the values in the local memory + B_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_0][local_id_1] = (real_type) 0.0; // SoA } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (uint class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (uint class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; 
++internal_j) { - C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_0) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_0) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_0) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_0) % THREAD_BLOCK_SIZE]; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_j = row_offset + j + (ulong) internal; - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_x], C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal][local_id_0]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ul], C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal][local_id_0 + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const ulong global_j_idx = device_row_offset + j_idx + (ulong) internal; + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal][local_id_0]); // SoA } - barrier(CLK_LOCAL_MEM_FENCE); // wai until all threads updated C with their values + barrier(CLK_LOCAL_MEM_FENCE); // wai until all work-items updated C with their values } } // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const ulong global_i = row_offset + i + (ulong) internal_i; - const ulong global_j = row_offset + j + (ulong) internal_j; + // calculate the indices to access the global data + const ulong global_i_idx = device_row_offset + i_idx + (ulong) internal_i; + const ulong global_j_idx = device_row_offset + j_idx + (ulong) internal_j; - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = (real_type) 0.0; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // reinterpret cache arrays with interchanged dimensions - __local real_type (*B_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type (*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_i; - __local real_type (*C_out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type (*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_j; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*B_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*C_out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; // iterate over all classes using blocking to be able to cache them for faster 
memory accesses - for (ulong dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ul) { + for (ulong class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_j_idx_linear = device_row_offset + j_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha * B[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y]; - B_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha * B[global_j * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul]; + // store the values in the local memory + B_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; - C_out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory - for (uint class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (uint class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_0] += - temp[internal_i][internal_j] * B_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_0] += + temp[internal_i][internal_j] * B_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i + (ulong) internal; - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y], C_out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul], C_out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0]); + // calculate the indices to access the global data + const ulong global_i_idx = device_row_offset + i_idx + (ulong) internal; + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); // SoA } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all threads updated C with their values + barrier(CLK_LOCAL_MEM_FENCE); // wait until all 
work-items updated C with their values } } } diff --git a/include/plssvm/backends/OpenCL/kernel/detail/fill_kernel.cl b/include/plssvm/backends/OpenCL/kernel/detail/fill_kernel.cl new file mode 100644 index 000000000..76c0ba424 --- /dev/null +++ b/include/plssvm/backends/OpenCL/kernel/detail/fill_kernel.cl @@ -0,0 +1,40 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implement a fill kernel using OpenCL. + */ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/** + * @brief Fill the float data pointer @p data with the @p value. + * @param[out] data the pointer to fill with values + * @param[in] value the value used to fill @p data + * @param[in] pos the start position for filling @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_fill_kernel_float(__global float *data, const float value, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + data[pos + idx] = value; + } +} + +/** + * @brief Fill the double data pointer @p data with the @p value. + * @param[out] data the pointer to fill with values + * @param[in] value the value used to fill @p data + * @param[in] pos the start position for filling @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_fill_kernel_double(__global double *data, const double value, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + data[pos + idx] = value; + } +} diff --git a/include/plssvm/backends/OpenCL/kernel/detail/memset_kernel.cl b/include/plssvm/backends/OpenCL/kernel/detail/memset_kernel.cl new file mode 100644 index 000000000..88b4f67e1 --- /dev/null +++ b/include/plssvm/backends/OpenCL/kernel/detail/memset_kernel.cl @@ -0,0 +1,46 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implement a memset kernel using OpenCL. + */ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +/** + * @brief Memset the float data pointer @p data with the @p value. + * @param[out] data the pointer to memset with the pattern + * @param[in] pattern the pattern used to memset @p data + * @param[in] pos the start position for the memset operation on @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_memset_kernel_float(__global float *data, const uchar pattern, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + // pack the 1-Byte pattern into a 4-Byte uint + const uint packed_pattern = (pattern << 24) | (pattern << 16) | (pattern << 8) | pattern; + // bitwise cast the uint to a float + data[pos + idx] = as_float(packed_pattern); + } +} + +/** + * @brief Memset the double data pointer @p data with the @p value. 
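The memset kernels here reproduce byte-wise memset semantics by replicating the 1-byte pattern across a full word and bit-casting it to the element type (as_float / as_double). A minimal host-side C++ sanity check of that packing, a sketch only, assuming std::memcpy as the stand-in for the OpenCL bit-cast and an arbitrary test pattern of 0xAB:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    const unsigned char pattern = 0xAB;  // arbitrary test pattern

    // replicate the 1-byte pattern into a 4-byte word, as device_memset_kernel_float does
    const std::uint32_t packed = (std::uint32_t{ pattern } << 24) | (std::uint32_t{ pattern } << 16) | (std::uint32_t{ pattern } << 8) | std::uint32_t{ pattern };

    // bit-cast the packed word to a float (host-side equivalent of as_float)
    float via_packing{};
    std::memcpy(&via_packing, &packed, sizeof(float));

    // reference behaviour: byte-wise memset of a single float
    float via_memset{};
    std::memset(&via_memset, pattern, sizeof(float));

    // both approaches must yield the identical byte pattern
    assert(std::memcmp(&via_packing, &via_memset, sizeof(float)) == 0);
    return 0;
}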
+ * @param[out] data the pointer to memset with the pattern + * @param[in] pattern the pattern used to memset @p data + * @param[in] pos the start position for the memset operation on @p data + * @param[in] size the number of elements in @p data + */ +__kernel void device_memset_kernel_double(__global double *data, const uchar pattern, const ulong pos, const ulong size) { + const ulong idx = get_global_id(0); + if (idx < size) { + // pack the 1-Byte pattern into an 8-Byte ulong + const ulong packed_pattern = ((ulong) pattern << 56) | ((ulong) pattern << 48) | ((ulong) pattern << 40) | ((ulong) pattern << 32) | ((ulong) pattern << 24) | ((ulong) pattern << 16) | ((ulong) pattern << 8) | ((ulong) pattern); + // bitwise cast th ulong to a double + data[pos + idx] = as_double(packed_pattern); + } +} diff --git a/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl b/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl index 286c9db05..70b66e305 100644 --- a/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl +++ b/include/plssvm/backends/OpenCL/kernel/kernel_functions.cl @@ -75,7 +75,7 @@ real_type apply_linear_kernel_function(const real_type value) { /** * @brief Compute the polynomial kernel function using @p value. - * @details Uses a custom power implementation taking advantage of the fact that degree can only be a positive integer. Hardcodes the power function for degrees <= 6. + * @details Uses a custom power implementation taking advantage of the fact that degree can only be a positive integer. * @param[in] value the value to apply the polynomial kernel function to * @param[in] degree the degree parameter of the polynomial kernel function * @param[in] gamma the gamma parameter of the polynomial kernel function @@ -84,36 +84,12 @@ real_type apply_linear_kernel_function(const real_type value) { */ real_type apply_polynomial_kernel_function(const real_type value, const int degree, const real_type gamma, const real_type coef0) { const real_type base = gamma * value + coef0; - switch (degree) { - case 0: return (real_type) 1.0; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result = 1.0; - for (int i = 0; i < degree; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result = 1.0; + for (int i = 0; i < degree; ++i) { + result *= base; } + return result; } /** diff --git a/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl b/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl index e37c1dbfb..42edc442f 100644 --- a/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl +++ b/include/plssvm/backends/OpenCL/kernel/predict_kernel.cl @@ -14,13 +14,13 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable /** - * @brief Predict the @p predict_points_d using the kernel function determined at runtime. + * @brief Predict the @p predict_points using the kernel function determined at runtime. * @details The `PLSSVM_DEVICE_KERNEL_PREDICT_NAME`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. 
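The kernel_functions.cl change above replaces the hardcoded powers for degrees <= 6 with a single generic multiply loop. A short C++ sketch of that loop (int_pow is a hypothetical stand-in name), checked against std::pow for small non-negative integer degrees:

#include <cassert>
#include <cmath>

// same repeated-multiplication scheme as the generic loop now used in apply_polynomial_kernel_function
double int_pow(const double base, const int degree) {
    double result = 1.0;
    for (int i = 0; i < degree; ++i) {
        result *= base;
    }
    return result;
}

int main() {
    assert(int_pow(1.5, 0) == 1.0);                               // degree 0 yields 1.0
    assert(int_pow(1.5, 3) == 1.5 * 1.5 * 1.5);                   // small degrees match the previously hardcoded cases
    assert(std::abs(int_pow(2.0, 6) - std::pow(2.0, 6)) < 1e-9);  // agrees with std::pow for integer exponents
    return 0;
}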
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -29,107 +29,126 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... only added for Doxygen) */ -__kernel void PLSSVM_DEVICE_KERNEL_PREDICT_NAME(__global real_type *prediction_d, const __global real_type *alpha_d, const __global real_type *rho_d, const __global real_type *sv_d, const __global real_type *predict_points_d, const ulong num_classes, const ulong num_sv, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void PLSSVM_DEVICE_KERNEL_PREDICT_NAME(__global real_type *prediction, const __global real_type *alpha, const __global real_type *rho, const __global real_type *support_vectors, const __global real_type *predict_points, const ulong num_classes, const ulong num_sv, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // create a thread private array used for internal caching + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // 
current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large + + // create two local memory arrays used for caching + __local real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_pp_idx = pp_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_sv_idx = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_pp[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_sv[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; - data_cache_sv[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*pp_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*sv_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const ulong sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_pp_idx_linear = pp_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const 
ulong global_sv_idx_linear = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_sv[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv], data_cache_pp[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(sv_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv], pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(sv_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv], pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } // update temp using the respective kernel function - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_pd][internal_sv] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER); + temp[internal_pp][internal_sv] = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp[internal_pp][internal_sv] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER); } } { - // reinterpret cache arrays with interchanged dimensions - __local real_type(*alpha_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_pp; - __local real_type(*out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) data_cache_sv; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + __local real_type(*alpha_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local 
real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_one; + __local real_type(*out_cache)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] = (__local real_type(*)[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]) cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ul) { + // calculate the indices used in the current thread + const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const ulong sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (ulong class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_sv_idx = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; - alpha_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_sv + PADDING_SIZE_ul) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_sv_idx_linear = sv_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == (ulong) 0) { - out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = -rho_d[dim + threadIdx_y]; - out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ul]; + out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = -rho[class_block + threadIdx_y]; } else { out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; - out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = (real_type) 0.0; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (uint class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (uint class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_0] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_1) % FEATURE_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_0] += + 
temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_1) % THREAD_BLOCK_SIZE][local_id_1 * INTERNAL_BLOCK_SIZE + internal_sv]; } } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const ulong global_pp_idx = pp_idx + (ulong) internal; - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y], out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ul) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ul], out_cache[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0]); } barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items updated their part of the prediction } diff --git a/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl b/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl index 5844b3a3a..1d579b40d 100644 --- a/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl +++ b/include/plssvm/backends/OpenCL/kernel/predict_kernel_linear.cl @@ -14,148 +14,185 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
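In the predict kernel above, the y-dimension of the global range is split across blocks of support vectors, so each work-group only produces a partial sum for a given (predict point, class) entry; the partials are combined with atomicAdd, and -rho is contributed only by work-groups with blockIdx_y == 0 so the bias is applied exactly once. A tiny sequential C++ sketch of that accumulation scheme, with made-up per-support-vector contributions and an assumed block size of 2:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
    // made-up per-support-vector contributions to one (predict point, class) entry
    const std::vector<double> contribution = { 0.4, -0.1, 0.7, 0.2, -0.3, 0.5 };
    const double rho = 0.25;
    const std::size_t block_size = 2;  // assumed number of support vectors handled per work-group

    // reference: full sum over all support vectors minus the bias
    double reference = -rho;
    for (const double c : contribution) {
        reference += c;
    }

    // blocked scheme: every block adds its partial sum to the global result,
    // and only the first block (blockIdx_y == 0 in the kernel) also contributes -rho
    double prediction = 0.0;
    for (std::size_t block = 0; block < contribution.size(); block += block_size) {
        double partial = (block == 0) ? -rho : 0.0;
        for (std::size_t sv = block; sv < std::min(block + block_size, contribution.size()); ++sv) {
            partial += contribution[sv];
        }
        prediction += partial;  // in the kernel this update is the atomicAdd into prediction[]
    }

    assert(std::abs(prediction - reference) < 1e-12);
    return 0;
}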
+ * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_w_linear(__global real_type *w_d, const __global real_type *alpha_d, const __global real_type *sv_d, const ulong num_classes, const ulong num_sv, const ulong device_specific_num_sv, const ulong sv_offset, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_w_linear(__global real_type *w, const __global real_type *alpha, const __global real_type *support_vectors, const ulong num_classes, const ulong num_sv, const ulong device_num_sv, const ulong device_sv_offset, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // create a thread private array used for internal caching + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const 
ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large + + // create two local memory arrays used for caching + __local real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (ulong sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_feature_idx = feature_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_class_idx = class_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - - data_cache_feature[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ul) + sv + threadIdx_y]; // SoA - data_cache_alpha[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ul) + sv + sv_offset + threadIdx_y]; // AoS - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (ulong sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_feature_idx_linear = feature_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_class_idx_linear = class_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + feature_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (uint block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the dot product calculation, the sv is the fastest moving index for (uint internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_1 * 
INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum = 0.0; + for (uint sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; } } +#else + // perform the dot product calculation, the sv is the slowest moving index + for (uint sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (uint internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const ulong feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (uint internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data const ulong global_feature_idx = feature_idx + (ulong) internal_feature; const ulong global_class_idx = class_idx + (ulong) internal_class; - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ul) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
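device_kernel_w_linear above precomputes w as the alpha-weighted sum of the support vectors per feature and class, so that predicting a point with the linear kernel reduces to a single dot product with w minus rho (which is what device_kernel_predict_linear then evaluates). A single-class C++ sketch of that identity, with toy alpha, support vector, and rho values made up purely for illustration:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
    // toy problem: 2 support vectors, 3 features, 1 class (all values made up)
    const std::vector<double> alpha = { 0.5, -0.25 };                   // learned weight per support vector
    const std::vector<std::vector<double>> sv = { { 1.0, 2.0, 3.0 },
                                                  { 4.0, 5.0, 6.0 } };  // support vectors
    const double rho = 0.1;                                             // learned bias

    // w[f] = sum over all support vectors of alpha[s] * sv[s][f] (what device_kernel_w_linear computes per class)
    std::vector<double> w(3, 0.0);
    for (std::size_t s = 0; s < sv.size(); ++s) {
        for (std::size_t f = 0; f < w.size(); ++f) {
            w[f] += alpha[s] * sv[s][f];
        }
    }

    // predicting a point is then a single dot product: w . x - rho
    const std::vector<double> x = { 1.0, 0.0, -1.0 };
    double prediction = -rho;
    for (std::size_t f = 0; f < w.size(); ++f) {
        prediction += w[f] * x[f];
    }

    // the same value computed directly from the support vectors
    double reference = -rho;
    for (std::size_t s = 0; s < sv.size(); ++s) {
        double dot = 0.0;
        for (std::size_t f = 0; f < x.size(); ++f) {
            dot += sv[s][f] * x[f];
        }
        reference += alpha[s] * dot;
    }
    assert(std::abs(prediction - reference) < 1e-12);
    return 0;
}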
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__kernel void device_kernel_predict_linear(__global real_type *prediction_d, const __global real_type *w_d, const __global real_type *rho_d, const __global real_type *predict_points_d, const ulong num_classes, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset) { +__kernel void device_kernel_predict_linear(__global real_type *prediction, const __global real_type *w, const __global real_type *rho, const __global real_type *predict_points, const ulong num_classes, const ulong num_predict_points, const ulong num_features, const ulong grid_x_offset, const ulong grid_y_offset) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - - // create the local memory arrays used for caching data point features - __local real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // create a thread private array used for internal caching + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + 
offsets if the global range is too large + + // create two local memory arrays used for caching + __local real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_pp_idx = pp_idx_linear + internal * THREAD_BLOCK_SIZE; - const ulong global_class_idx = class_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_pp[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_predict_points + PADDING_SIZE_ul) + global_pp_idx]; - data_cache_w[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ul) + global_class_idx]; - data_cache_w[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_classes + PADDING_SIZE_ul) + global_class_idx]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const ulong pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const ulong class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const ulong global_pp_idx_linear = pp_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + const ulong global_class_idx_linear = class_idx_linear + (ulong) internal * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { +#if defined(PLSSVM_OPENCL_TARGET_CPUS) + // perform the feature reduction calculation, the feature is the fastest moving index + for (uint internal_pp = 
0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum = 0.0; + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; } } +#else + // perform the feature reduction calculation, the feature is the slowest moving index + for (uint feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } + } +#endif + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (uint internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const ulong pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const ulong class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (uint internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (uint internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const ulong global_pp_idx = pp_idx + (ulong) internal_pd; + // calculate the indices to access the global data + const ulong global_pp_idx = pp_idx + (ulong) internal_pp; const ulong global_class_idx = class_idx + (ulong) internal_class; - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ul) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index e1041024a..01b1ec54a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -21,7 +21,6 @@ #include // std::array #include // std::ceil #include // std::size_t -#include // std::vector namespace plssvm::openmp::detail { @@ -29,60 +28,65 @@ namespace plssvm::openmp::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
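The "memory optimized" storage of the symmetric matrix A mentioned above is, judging from the index arithmetic in the loop body below, a padded packed upper-triangular layout: row r only stores columns r..n-1 followed by PADDING_SIZE padding entries, and element (r, c) with r <= c sits at offset r * (n + PADDING_SIZE) + c - r * (r + 1) / 2, where n = num_rows - device_row_offset. A short C++ check of that closed form against a layout built row by row (n and PADDING are small assumed values):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t n = 6;        // rows/columns of the symmetric block (assumed toy value)
    constexpr std::size_t PADDING = 2;  // padding entries per packed row (assumed toy value)

    // build the packed upper triangle row by row: row r stores columns r..n-1, then PADDING zeros
    std::vector<double> packed;
    std::vector<std::vector<std::size_t>> expected_offset(n, std::vector<std::size_t>(n, 0));
    for (std::size_t r = 0; r < n; ++r) {
        for (std::size_t c = r; c < n; ++c) {
            expected_offset[r][c] = packed.size();
            packed.push_back(1.0);  // placeholder value, only the offsets matter here
        }
        packed.insert(packed.end(), PADDING, 0.0);
    }

    // closed-form index used in device_kernel_symm for r <= c
    for (std::size_t r = 0; r < n; ++r) {
        for (std::size_t c = r; c < n; ++c) {
            const std::size_t idx = r * (n + PADDING) + c - r * (r + 1) / 2;
            assert(idx == expected_offset[r][c]);
        }
    }
    return 0;
}

For entries below the diagonal the kernel simply swaps the roles of row and column around the diagonal check, i.e. it reads index(min(r, c), max(r, c)), which is where the symmetry of A is exploited.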
* @param[in] num_rows the number of rows and columns in @p A * @param[in] num_rhs the number of rows in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_device_specific_num_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += 
THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -90,13 +94,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of 
bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -111,69 +116,75 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset 
({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_num_mirror_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_num_mirror_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values using blocking + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + sum += A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B(global_i_idx, dim_block + dim + device_row_offset); // SoA + } + temp[internal_j][internal_i] += sum; } } } - // apply 
the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9403b12a1..70d2f9edb 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -28,83 +28,89 @@ namespace plssvm::openmp::detail { /** * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use + * @tparam kernel_function the compile-time kernel function to use * @tparam Args the types of the potential additional arguments for the @p kernel function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all 
features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 771689209..332a0a26a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -26,25 +26,25 @@ namespace plssvm::openmp::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -53,68 +53,96 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal 
caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_row); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_col); -// symmetry + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); + // symmetry #pragma omp atomic - C(class_idx, global_col) += alpha * temp_ij * B(class_idx, global_row); + C(class_block + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp index 59fd0f43c..359e2f8ff 100644 --- a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp @@ -27,42 +27,17 @@ namespace plssvm::openmp::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
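The hunk above intentionally drops the hard-coded special cases for degree <= 6 and keeps only the plain O(exponent) loop, since the polynomial degree is small in practice. Purely as an illustration (not what this patch uses), a square-and-multiply variant would get by with O(log exponent) multiplications; the sketch below is hypothetical and assumes the same real_type alias as the surrounding header:

// illustrative sketch only, not part of the patch
[[nodiscard]] inline real_type powi_by_squaring(real_type base, int exponent) {
    real_type result{ 1.0 };
    while (exponent > 0) {
        if (exponent & 1) {
            result *= base;  // multiply the current bit in
        }
        base *= base;        // square the base for the next bit
        exponent >>= 1;
    }
    return result;
}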
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 407096055..741c696af 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -31,28 +31,68 @@ namespace plssvm::openmp::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_features = support_vectors.num_cols(); + const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / 
INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, device_specific_num_sv, sv_offset) - for (std::size_t a = 0; a < num_classes; ++a) { - for (std::size_t dim = 0; dim < num_features; ++dim) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t idx = 0; idx < device_specific_num_sv; ++idx) { - temp = std::fma(alpha(a, sv_offset + idx), support_vectors(sv_offset + idx, dim), temp); + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, device_sv_offset) + for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t feature_thread = 0; feature_thread < THREAD_BLOCK_SIZE_uz; ++feature_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { + // calculate the indices used in the current thread + const std::size_t feature_idx = (feature_block + feature_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; + } + } + } } - w(a, dim) = temp; } } } @@ -63,29 +103,69 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const 
std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); -#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, device_specific_num_predict_points, row_offset) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t dim = 0; dim < num_features; ++dim) { - temp = std::fma(w(a, dim), predict_points(row_offset + point_index, dim), temp); + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { + // calculate the indices used in the current thread + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const 
std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; + } + } + } + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; + } + } + } } - prediction(row_offset + point_index, a) = temp - rho[a]; } } } @@ -99,59 +179,64 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - prediction(row_offset + point_index, a) -= rho[a]; + for (std::size_t pp_idx = 0; pp_idx < device_num_predict_points; ++pp_idx) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + prediction(device_row_offset + pp_idx, class_idx) -= rho[class_idx]; } } #pragma omp parallel for collapse(2) - for (std::size_t pp = 0; pp < blocked_device_specific_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) { - for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += 
THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) { - for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t sv_thread = 0; sv_thread < THREAD_BLOCK_SIZE_uz; ++sv_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (pp + pp_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (sv + sv_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (sv_block + sv_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -159,22 +244,21 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } - // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + // atomically add the results to the prediction + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not 
perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, a) += alpha(a, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp index 55b6a746b..b21d95619 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp @@ -18,7 +18,7 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.hpp" // plssvm::adaptivecpp::detail::device_ptr #include "plssvm/backends/SYCL/AdaptiveCpp/detail/pinned_memory.hpp" // plssvm::adaptivecpp::detail::pinned_memory #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue (PImpl) -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size @@ -61,7 +61,7 @@ class csvm : public ::plssvm::detail::gpu_csvm(named_args)... }; - // check whether a specific SYCL kernel invocation type has been requested - if constexpr (parser.has(sycl_kernel_invocation_type)) { + // check whether a specific SYCL data parallel kernel has been requested + if constexpr (parser.has(sycl_data_parallel_kernel)) { // compile time check: the value must have the correct type - invocation_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_kernel_invocation_type); + data_parallel_kernel_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_data_parallel_kernel); #if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) - if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { - throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the AdaptiveCpp SYCL backend!" }; - } else if (invocation_type_ == sycl::kernel_invocation_type::scoped) { - throw ::plssvm::invalid_parameter_exception{ "he provided sycl::kernel_invocation_type::scoped is disabled for the AdaptiveCpp SYCL backend!" }; + if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::data_parallel_kernel::hierarchical is disabled for the AdaptiveCpp SYCL backend!" }; + } else if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::scoped) { + throw ::plssvm::invalid_parameter_exception{ "he provided sycl::data_parallel_kernel::scoped is disabled for the AdaptiveCpp SYCL backend!" }; } #endif } @@ -112,10 +112,10 @@ class csvm : public ::plssvm::detail::gpu_csvm /** * @brief Convert the provided @p grid and @p block to the final SYCL execution range. 
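For context on how the renamed template parameter below steers the launch, this is a minimal standalone SYCL 2020 sketch (placeholder grid/block values, not PLSSVM code) of the two most common flavors: a basic data parallel kernel launched over a plain range, and a work-group data parallel kernel launched over an nd_range.

#include "sycl/sycl.hpp"

void launch_flavor_sketch(::sycl::queue &q) {
    const ::sycl::range<2> grid{ 4, 4 };    // number of work-groups per dimension
    const ::sycl::range<2> block{ 16, 16 }; // work-items per work-group

    // basic data parallel kernel: only a global range, indexed via sycl::item
    q.parallel_for(::sycl::range<2>{ grid * block }, [=](::sycl::item<2> idx) {
        // idx.get_id(0) / idx.get_id(1) give the global index
    });

    // work-group data parallel kernel: global + local range, indexed via sycl::nd_item
    q.parallel_for(::sycl::nd_range<2>{ grid * block, block }, [=](::sycl::nd_item<2> idx) {
        // idx.get_global_id(), idx.get_local_id(), and idx.get_group() are available
    });

    q.wait();
}

The hierarchical and scoped flavors in the hunk below instead build an nd_range directly from the grid and block, since SYCL's hierarchical parallel_for_work_group is specified in terms of the number of work-groups rather than the global size.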
- * @tparam invocation_type the SYCL kernel invocation type + * @tparam kernel_type the SYCL data parallel kernel * @param[in] grid the execution grid * @param[in] block the execution block * @return the SYCL native execution range */ -template +template auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); const ::sycl::range native_block = detail::dim_type_to_native<2>(block); - if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + if constexpr (kernel_type == sycl::data_parallel_kernel::basic) { return ::sycl::range<2>{ native_grid * native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::work_group) { return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical || invocation_type == sycl::kernel_invocation_type::scoped) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::hierarchical || kernel_type == sycl::data_parallel_kernel::scoped) { return ::sycl::nd_range<2>{ native_grid, native_block }; } else { // can't be reached diff --git a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp index 4b1a6b570..f322cb877 100644 --- a/include/plssvm/backends/SYCL/DPCPP/csvm.hpp +++ b/include/plssvm/backends/SYCL/DPCPP/csvm.hpp @@ -15,10 +15,10 @@ #include "plssvm/backends/execution_range.hpp" // plssvm::detail::{dim_type, execution_range} #include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/DPCPP/detail/device_ptr.hpp" // plssvm::dpcpp::detail::device_ptr #include "plssvm/backends/SYCL/DPCPP/detail/pinned_memory.hpp" // plssvm::dpcpp::detail::pinned_memory #include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::dpcpp::detail::queue (PImpl) -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/igor_utility.hpp" // plssvm::detail::get_value_from_named_parameter #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size @@ -64,7 +64,7 @@ class csvm : public ::plssvm::detail::gpu_csvm(named_args)... }; - // check whether a specific SYCL kernel invocation type has been requested - if constexpr (parser.has(sycl_kernel_invocation_type)) { + // check whether a specific SYCL data parallel kernel has been requested + if constexpr (parser.has(sycl_data_parallel_kernel)) { // compile time check: the value must have the correct type - invocation_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_kernel_invocation_type); - // the invocation type "scoped" isn't supported by DPC++ - if (invocation_type_ == sycl::kernel_invocation_type::scoped) { - throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::scoped isn't supported by DPC++!" 
}; + data_parallel_kernel_type_ = ::plssvm::detail::get_value_from_named_parameter(parser, sycl_data_parallel_kernel); + // the data parallel kernel "scoped" isn't supported by DPC++ + if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::scoped) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::data_parallel_kernel::scoped isn't supported by DPC++!" }; } #if !defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) - if (invocation_type_ == sycl::kernel_invocation_type::hierarchical) { - throw ::plssvm::invalid_parameter_exception{ "The provided sycl::kernel_invocation_type::hierarchical is disabled for the DPC++ SYCL backend!" }; + if (data_parallel_kernel_type_ == sycl::data_parallel_kernel::hierarchical) { + throw ::plssvm::invalid_parameter_exception{ "The provided sycl::data_parallel_kernel::hierarchical is disabled for the DPC++ SYCL backend!" }; } #endif } @@ -114,10 +114,10 @@ class csvm : public ::plssvm::detail::gpu_csvm /** * @brief Convert the provided @p grid and @p block to the final SYCL execution range. - * @tparam invocation_type the SYCL kernel invocation type + * @tparam kernel_type the SYCL data parallel kernel * @param[in] grid the execution grid * @param[in] block the execution block * @return the SYCL native execution range */ -template +template auto get_execution_range(const ::plssvm::detail::dim_type &grid, const ::plssvm::detail::dim_type &block) { const ::sycl::range native_grid = detail::dim_type_to_native<2>(grid); const ::sycl::range native_block = detail::dim_type_to_native<2>(block); - if constexpr (invocation_type == sycl::kernel_invocation_type::basic) { + if constexpr (kernel_type == sycl::data_parallel_kernel::basic) { return ::sycl::range<2>{ native_grid * native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::work_group) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::work_group) { return ::sycl::nd_range<2>{ native_grid * native_block, native_block }; - } else if constexpr (invocation_type == sycl::kernel_invocation_type::hierarchical) { + } else if constexpr (kernel_type == sycl::data_parallel_kernel::hierarchical) { return ::sycl::nd_range<2>{ native_grid, native_block }; } else { // can't be reached diff --git a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp b/include/plssvm/backends/SYCL/data_parallel_kernels.hpp similarity index 54% rename from include/plssvm/backends/SYCL/kernel_invocation_types.hpp rename to include/plssvm/backends/SYCL/data_parallel_kernels.hpp index d7cec1924..ede8ee3fb 100644 --- a/include/plssvm/backends/SYCL/kernel_invocation_types.hpp +++ b/include/plssvm/backends/SYCL/data_parallel_kernels.hpp @@ -6,11 +6,11 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Defines an enumeration holding all possible SYCL kernel invocation types. + * @brief Defines an enumeration holding all possible SYCL data parallel kernels. */ -#ifndef PLSSVM_BACKENDS_SYCL_KERNEL_INVOCATION_TYPE_HPP_ -#define PLSSVM_BACKENDS_SYCL_KERNEL_INVOCATION_TYPE_HPP_ +#ifndef PLSSVM_BACKENDS_SYCL_DATA_PARALLEL_KERNELS_HPP_ +#define PLSSVM_BACKENDS_SYCL_DATA_PARALLEL_KERNELS_HPP_ #pragma once #include "fmt/base.h" // fmt::formatter @@ -22,10 +22,10 @@ namespace plssvm::sycl { /** - * @brief Enum class for all possible SYCL kernel invocation types. + * @brief Enum class for all possible SYCL data parallel kernels. 
*/ -enum class kernel_invocation_type { - /** Use the best kernel invocation type for the current SYCL implementation and target hardware platform. */ +enum class data_parallel_kernel { + /** Use the best data parallel kernel for the current SYCL implementation and target hardware platform. In practice, will nearly always map to work-group data parallel kernels. */ automatic, /** Use the [`basic` data parallel kernels](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_basic_data_parallel_kernels). */ basic, @@ -38,35 +38,35 @@ enum class kernel_invocation_type { }; /** - * @brief Return a list of all currently available SYCL kernel invocation types. - * @details SYCL's hierarchical and AdaptiveCpp's scoped kernel invocation type can be disabled during the CMake configuration. - * @return the available SYCL kernel invocation types (`[[nodiscard]]`) + * @brief Return a list of all currently available SYCL data parallel kernels. + * @details SYCL's hierarchical data parallel kernels and AdaptiveCpp's scoped parallelism can be disabled during the CMake configuration. + * @return the available SYCL data parallel kernels (`[[nodiscard]]`) */ -[[nodiscard]] std::vector list_available_sycl_kernel_invocation_types(); +[[nodiscard]] std::vector list_available_sycl_data_parallel_kernels(); /** - * @brief Output the @p invocation type to the given output-stream @p out. + * @brief Output the @p kernel_type type to the given output-stream @p out. * @param[in,out] out the output-stream to write the backend type to - * @param[in] invocation the SYCL kernel invocation type + * @param[in] kernel_type the SYCL data parallel kernel * @return the output-stream */ -std::ostream &operator<<(std::ostream &out, kernel_invocation_type invocation); +std::ostream &operator<<(std::ostream &out, data_parallel_kernel kernel_type); /** - * @brief Use the input-stream @p in to initialize the @p invocation type. + * @brief Use the input-stream @p in to initialize the @p kernel_type type. 
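A short usage sketch for the facilities declared above: it checks at runtime whether hierarchical kernels were compiled in (they can be disabled during the CMake configuration) and round-trips one enumerator through the stream operators. The textual spelling accepted by operator>> is an assumption here, not something this patch specifies.

#include "plssvm/backends/SYCL/data_parallel_kernels.hpp"

#include <algorithm>  // std::find
#include <iostream>   // std::cout
#include <sstream>    // std::istringstream
#include <vector>     // std::vector

void data_parallel_kernel_usage_sketch() {
    // query which data parallel kernels are available in this build
    const std::vector<plssvm::sycl::data_parallel_kernel> available = plssvm::sycl::list_available_sycl_data_parallel_kernels();
    const bool has_hierarchical = std::find(available.cbegin(), available.cend(), plssvm::sycl::data_parallel_kernel::hierarchical) != available.cend();

    // parse a command line style value and print it back using the stream operators declared above
    plssvm::sycl::data_parallel_kernel kernel{};
    std::istringstream{ "work_group" } >> kernel;  // assumed spelling of the enumerator
    std::cout << kernel << " (hierarchical available: " << has_hierarchical << ")\n";
}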
* @param[in,out] in input-stream to extract the backend type from - * @param[in] invocation the SYCL kernel invocation type + * @param[in] kernel_type the SYCL data parallel kernel * @return the input-stream */ -std::istream &operator>>(std::istream &in, kernel_invocation_type &invocation); +std::istream &operator>>(std::istream &in, data_parallel_kernel &kernel_type); } // namespace plssvm::sycl /// @cond Doxygen_suppress template <> -struct fmt::formatter : fmt::ostream_formatter { }; +struct fmt::formatter : fmt::ostream_formatter { }; /// @endcond -#endif // PLSSVM_BACKENDS_SYCL_KERNEL_INVOCATION_TYPE_HPP_ +#endif // PLSSVM_BACKENDS_SYCL_DATA_PARALLEL_KERNELS_HPP_ diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index 2e528149c..120f637b9 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
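The operator() further below indexes A as a row-major, padded, packed upper triangular matrix: the two branches of its if statement swap the roles of the row and column index so that row <= column always holds, and then use the linear index r * (n + pad) + c - r * (r + 1) / 2, where n corresponds to num_rows_ - device_row_offset_ and pad to PADDING_SIZE. A standalone sketch of that layout, with hypothetical helper names:

#include <cstddef>  // std::size_t
#include <vector>   // std::vector

// linear index of entry (r, c), r <= c, in the padded packed upper triangular layout
std::size_t packed_upper_index(const std::size_t r, const std::size_t c, const std::size_t n, const std::size_t pad) {
    return r * (n + pad) + c - r * (r + 1) / 2;
}

// pack the upper triangle of a full n x n matrix row by row, padding each row with pad zeros
std::vector<double> pack_upper_triangular(const std::vector<std::vector<double>> &full, const std::size_t pad) {
    const std::size_t n = full.size();
    std::vector<double> packed(n * (n + pad) - n * (n - 1) / 2, 0.0);  // row r holds (n - r) values plus pad padding entries
    for (std::size_t r = 0; r < n; ++r) {
        for (std::size_t c = r; c < n; ++c) {
            packed[packed_upper_index(r, c, n, pad)] = full[r][c];
        }
    }
    return packed;
}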
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,33 +66,63 @@ class device_kernel_symm { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + 
static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_j) { - A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + sum += A_cache * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + temp[internal_i][internal_j] += A_cache * B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } } - - temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } } } @@ -93,13 +130,14 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +147,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -125,16 +163,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
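A reading aid for the `A_` index arithmetic above: `A` holds only the upper triangular part, packed row by row with a padded row stride, and the two branches merely swap the roles of `dim` and `global_j_idx` so that the smaller index always selects the packed row. A small sketch of that formula with made-up sizes (the helper name and the example values are not taken from the code base):

```cpp
#include <algorithm>
#include <cstddef>

// index of element (row, col) in a row-wise packed upper triangular n x n matrix whose
// rows are padded to a stride of n + padding; row r starts r * (n + padding) elements in,
// minus the r * (r + 1) / 2 entries the packing drops below the diagonal
constexpr std::size_t packed_upper_triangular_index(const std::size_t row, const std::size_t col, const std::size_t n, const std::size_t padding) {
    const std::size_t r = std::min(row, col);  // the smaller index selects the packed row ...
    const std::size_t c = std::max(row, col);  // ... the larger one the column (A is symmetric)
    return r * (n + padding) - r * (r + std::size_t{ 1 }) / std::size_t{ 2 } + c;
}

// without padding, a 4 x 4 upper triangle is stored as rows of length 4, 3, 2, 1
static_assert(packed_upper_triangular_index(0, 0, 4, 0) == 0);
static_assert(packed_upper_triangular_index(1, 1, 4, 0) == 4);
static_assert(packed_upper_triangular_index(2, 3, 4, 0) == 8);
static_assert(packed_upper_triangular_index(3, 1, 4, 0) == packed_upper_triangular_index(1, 3, 4, 0));
```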
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -143,12 +186,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -162,25 +205,49 @@ class device_kernel_symm_mirror { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); - - temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } } } } @@ -188,13 +255,14 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx 
= device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -205,8 +273,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -223,6 +291,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -244,19 +315,21 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -277,6 +350,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
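The index scheme of these element-wise kernels is the same blocking used throughout the file: each work-item owns an `INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE` tile, and the grid offsets shift the whole launch when one execution grid cannot cover the matrix. A sketch of that mapping with assumed example block sizes (the real values come from `plssvm/constants.hpp`):

```cpp
#include <cstddef>

constexpr std::size_t THREAD_BLOCK_SIZE_example = 16;   // assumed value for the sketch
constexpr std::size_t INTERNAL_BLOCK_SIZE_example = 4;  // assumed value for the sketch

// first row/column of the tile owned by the work-item with the given id, mirroring
// (idx.get_id(...) + grid_offset * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE from above
constexpr std::size_t tile_origin(const std::size_t item_id, const std::size_t grid_offset) {
    return (item_id + grid_offset * THREAD_BLOCK_SIZE_example) * INTERNAL_BLOCK_SIZE_example;
}

static_assert(tile_origin(0, 0) == 0);   // work-item 0 starts at row/column 0
static_assert(tile_origin(1, 0) == 4);   // neighbouring work-items own adjacent 4-wide tiles
static_assert(tile_origin(0, 1) == 64);  // the next grid continues 16 work-items * 4 values later
```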
* @param[in] num_cols the number of columns in the matrix @@ -298,19 +374,21 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 65587ddaa..6e1c99e65 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -13,9 +13,11 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -27,19 +29,23 @@ namespace plssvm::sycl::detail::basic { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. 
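To summarize what the assembly functor stores (see the epilogue of `operator()` further down in this hunk): the blocked feature loop accumulates the pairwise reduction, and the final step applies the kernel function `k` together with the dimensional reduction terms, writing only the packed upper triangular entries with `global_i_idx >= global_j_idx`:

```latex
% value of each stored kernel matrix entry (upper triangular part only, i >= j)
\[
    \tilde{K}_{ij} \;=\; k\!\left(\vec{x}_i, \vec{x}_j\right) + \mathit{QA\_cost} - q_i - q_j
    \;+\;
    \begin{cases}
        \mathit{cost} & \text{if } i = j \\
        0             & \text{otherwise}
    \end{cases}
\]
```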
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,19 +54,19 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward<Args>(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -69,25 +75,50 @@ class device_kernel_assembly { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows - if (i >= j) { - // create a work-item private array used for internal caching + // only calculate the upper triangular matrix + if (i_idx >= j_idx) { + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + 
static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -95,22 +126,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -119,11 +151,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index de6358ec8..f1c3e8945 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,9 @@ #define 
PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,36 +66,15 @@ class device_kernel_symm { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - 
::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -97,37 +83,44 @@ class device_kernel_symm { } }); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = 
static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + 
B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); @@ -135,13 +128,28 @@ class device_kernel_symm { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -152,17 +160,31 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto 
device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -173,8 +195,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -189,16 +211,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
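As a counterpart to the basic launch shape shown earlier, a hierarchical functor such as this one is submitted through `parallel_for_work_group`, with work-group scope between the `parallel_for_work_item` calls and an implicit work-group barrier after each of them. A rough sketch with made-up extents (the actual submission code again lives in the backend sources):

```cpp
#include <sycl/sycl.hpp>

int main() {
    ::sycl::queue queue{};

    const ::sycl::range<2> num_work_groups{ 8, 8 };
    const ::sycl::range<2> work_group_size{ 16, 16 };

    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(num_work_groups, work_group_size, [=](::sycl::group<2> group) {
            // variables declared at this scope live once per work-group (like A_cache/B_cache above)
            group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
                // per work-item code; an implicit work-group barrier follows this scope,
                // which is what the "// implicit barrier" comments above rely on
                (void) idx;
            });
        });
    }).wait();
    return 0;
}
```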
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -207,12 +234,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -226,36 +253,15 @@ class device_kernel_symm_mirror { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + 
grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices and diagonal condition - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -263,42 +269,67 @@ class device_kernel_symm_mirror { } }); - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> 
store twice as many values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // store the values in the local memory + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); // implicit barrier - // perform the feature reduction calculation + // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += 
A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -309,17 +340,31 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -331,8 +376,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -349,6 +394,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. 
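The `if constexpr (target == target_platform::cpu)` branches introduced above select the loop order of the cached dot product: on the CPU the reduction dimension `dim` is the fastest moving index, presumably so the auto-vectorizers (whose preferred vector width is configured elsewhere in this patch) see a plain contiguous reduction, while on other targets `dim` is the slowest moving index so each cached value is reused across the whole per-work-item register block. The following standalone sketch only illustrates those two loop orders; it works on one work-item's slice of the caches, uses illustrative sizes, plain `double` instead of `real_type`, and the hypothetical names `block_dot_cpu`/`block_dot_gpu` that do not appear in the patch.

// Simplified, self-contained sketch of the two loop orders (not the PLSSVM kernels themselves).
namespace sketch {

constexpr unsigned THREAD_BLOCK_SIZE = 8;    // illustrative value
constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // illustrative value

// CPU variant: `dim` is the innermost (fastest moving) loop, so each temp[i][j]
// is a plain dot product that an auto-vectorizer can turn into SIMD code.
void block_dot_cpu(const double A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   const double B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   double temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]) {
    for (unsigned i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
        for (unsigned j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
            double sum = 0.0;
            for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
                sum += A_cache[dim][j] * B_cache[dim][i];
            }
            temp[i][j] += sum;
        }
    }
}

// Non-CPU variant: `dim` is the outermost (slowest moving) loop, so every cached
// value loaded for one `dim` is reused across the full INTERNAL_BLOCK_SIZE x
// INTERNAL_BLOCK_SIZE block before the next `dim` is touched.
void block_dot_gpu(const double A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   const double B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE],
                   double temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]) {
    for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
        for (unsigned i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
            for (unsigned j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
                temp[i][j] += A_cache[dim][j] * B_cache[dim][i];
            }
        }
    }
}

}  // namespace sketch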
+ constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -371,25 +419,27 @@ class device_kernel_inplace_matrix_add { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -411,6 +461,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
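The rewritten `device_kernel_inplace_matrix_add` (and the matching scale kernel below) keeps the existing addressing scheme: every work-item owns an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile, matrices are stored row-major with PADDING_SIZE extra columns per row, and the global element (i, j) therefore lives at `i * (num_cols + PADDING_SIZE) + j`. The host-side sketch below only restates that index math; the function name `add_tile`, the constant values, and the use of `double` and `std::vector` are illustrative and not part of the patch.

// Host-side sketch of the tile/index decomposition used by the in-place add kernel.
#include <cstddef>
#include <vector>

constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // illustrative value
constexpr std::size_t PADDING_SIZE = 8;      // illustrative value

// lhs += rhs for one INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of a padded,
// row-major matrix, using the same decomposition as the SYCL kernel:
// global index = (work-group index * work-group size + work-item index) * INTERNAL_BLOCK_SIZE + internal index.
void add_tile(std::vector<double> &lhs, const std::vector<double> &rhs, const std::size_t num_cols,
              const std::size_t block_idx_y, const std::size_t block_dim_y, const std::size_t thread_idx_y,
              const std::size_t block_idx_x, const std::size_t block_dim_x, const std::size_t thread_idx_x) {
    const std::size_t i_idx = (block_idx_y * block_dim_y + thread_idx_y) * INTERNAL_BLOCK_SIZE;
    const std::size_t j_idx = (block_idx_x * block_dim_x + thread_idx_x) * INTERNAL_BLOCK_SIZE;
    for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
        for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
            const std::size_t global_i = i_idx + internal_i;
            const std::size_t global_j = j_idx + internal_j;
            // padded row-major addressing: the row stride is num_cols + PADDING_SIZE
            lhs[global_i * (num_cols + PADDING_SIZE) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE) + global_j];
        }
    }
}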
* @param[in] num_cols the number of columns in the matrix @@ -433,25 +486,27 @@ class device_kernel_inplace_matrix_scale { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index b09fef0f8..e6afac623 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -13,12 +13,15 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include 
"plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -27,19 +30,23 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,19 +55,19 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -68,67 +75,56 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; - - ::sycl::private_memory temp{ group }; - - // initialize private and local variables - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // create two local memory arrays used for caching + real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + + // create a private memory array used for internal caching + ::sycl::private_memory, INTERNAL_BLOCK_SIZE>, 2> temp{ group }; + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further + if (group[1] >= group[0]) { // initialize private temp matrix to zero - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } } - } - }); - - // implicit group barrier + }); - // exploit symmetry - if (group[1] >= group[0]) { - for (std::size_t 
dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = 
device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -136,14 +132,30 @@ class device_kernel_assembly { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -154,26 +166,40 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global 
range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -183,11 +209,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 9e8500d73..1334e566d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, 
FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -62,92 +69,111 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> 
idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // create two local memory arrays used for caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; 
// num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + 
if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (partial) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto 
device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -159,8 +185,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -175,16 +201,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -193,12 +224,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + 
device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -215,83 +246,105 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number 
of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // store the values in the local memory + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform 
the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (remaining) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + 
device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -304,8 +357,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -322,6 +375,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -346,28 +402,29 @@ class device_kernel_inplace_matrix_add { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j 
= j + internal_j; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -390,6 +447,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in the matrix @@ -414,28 +474,29 @@ class device_kernel_inplace_matrix_scale { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto 
global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 4ed3764ce..c2fcc5df6 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -13,12 +13,15 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -27,19 +30,23 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. 
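// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// The in-place matrix add/scale kernels above all use the same addressing: the matrix is stored
// row-major with each row padded by PADDING_SIZE entries, and every work-item updates an
// INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile. The host-only sketch below replays that index
// math with plain loops so it can be checked in isolation; the constant values, the std::vector
// buffers, and the explicit bounds check are assumptions made only for this sketch.
#include <cstddef>
#include <vector>

namespace sketch {

constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // assumed value; the real constant lives in plssvm/constants.hpp
constexpr unsigned PADDING_SIZE = 8;         // assumed value

// lhs += rhs for a num_rows x num_cols matrix; both buffers must hold num_rows * (num_cols + PADDING_SIZE) values
inline void inplace_matrix_add(std::vector<double> &lhs, const std::vector<double> &rhs,
                               const std::size_t num_rows, const std::size_t num_cols) {
    const std::size_t padded_cols = num_cols + static_cast<std::size_t>(PADDING_SIZE);
    // i_idx/j_idx advance in tiles, exactly like one work-item per INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE block
    for (std::size_t i_idx = 0; i_idx < num_rows; i_idx += INTERNAL_BLOCK_SIZE) {
        for (std::size_t j_idx = 0; j_idx < num_cols; j_idx += INTERNAL_BLOCK_SIZE) {
            for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
                for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                    const std::size_t global_i = i_idx + internal_i;
                    const std::size_t global_j = j_idx + internal_j;
                    // the device kernels rely on the padded allocation instead of this check;
                    // it is only added here to keep the sketch self-contained
                    if (global_i < num_rows && global_j < num_cols) {
                        lhs[global_i * padded_cols + global_j] += rhs[global_i * padded_cols + global_j];  // SoA-style flat indexing
                    }
                }
            }
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------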
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,19 +55,19 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward<Args>(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -71,94 +78,118 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); - - // exploit symmetry + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp + [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); - // perform calculations + // perform the feature reduction calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 
0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } }); } + // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto 
device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -169,11 +200,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index ae07f7ec6..b179cbabe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -24,16 +26,21 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
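// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// The kernel matrix assembled above (and read again by device_kernel_symm below) only stores the
// upper triangular part, packed row by row and padded, using
//     idx(i, j) = j * (n + PADDING_SIZE) - j * (j + 1) / 2 + i     for i >= j, with n = num_rows - device_row_offset.
// The host-only helper below re-derives that formula and checks that it enumerates each packed row
// contiguously; n and the padding value are assumptions chosen for this sketch only.
#include <cassert>
#include <cstddef>

namespace sketch {

constexpr std::size_t packed_upper_index(const std::size_t i, const std::size_t j,
                                         const std::size_t n, const std::size_t padding) {
    // row j starts after j previous rows whose lengths shrink by one entry each, every row keeping its padding tail
    return j * (n + padding) - j * (j + 1) / 2 + i;
}

inline void check_packed_upper_layout() {
    constexpr std::size_t n = 16;       // assumed number of rows handled by this device
    constexpr std::size_t padding = 4;  // assumed PADDING_SIZE
    std::size_t expected = 0;
    for (std::size_t j = 0; j < n; ++j) {
        // entries with i >= n are the padding tail of packed row j
        for (std::size_t i = j; i < n + padding; ++i) {
            assert(packed_upper_index(i, j, n, padding) == expected);
            ++expected;
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------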
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -42,13 +49,13 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(FEATURE_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(FEATURE_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, + device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + A_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(THREAD_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast<std::size_t>(THREAD_BLOCK_SIZE), static_cast<std::size_t>(INTERNAL_BLOCK_SIZE) * static_cast<std::size_t>(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -67,72 +74,85 @@ class device_kernel_symm { const auto local_id_1 = static_cast<unsigned>(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else 
{ + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; 
++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -147,8 +167,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -163,17 +183,22 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
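// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// device_kernel_symm above (and device_kernel_symm_mirror below) pick one of two loop orders for
// the cached tile update via `if constexpr (target == target_platform::cpu)`: on the CPU path the
// reduction dimension `dim` is the innermost loop and is summed into a scalar (friendly to the
// auto-vectorizer), on the other targets `dim` is the outermost loop. Both orders produce the same
// tile, as the standalone pair below shows; the block sizes and the real_type alias are
// assumptions for this sketch, and the performance rationale is the one implied by the comments in
// the patch, not a measured claim.
namespace sketch {

constexpr unsigned THREAD_BLOCK_SIZE = 8;    // assumed value
constexpr unsigned INTERNAL_BLOCK_SIZE = 4;  // assumed value
using real_type = double;                    // assumed alias

using cache_type = real_type[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE];
using tile_type = real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];

// CPU-style update: dim is the fastest moving index, reduced into a scalar accumulator
inline void tile_update_cpu(const cache_type &A_cache, const cache_type &B_cache, tile_type &temp,
                            const unsigned local_id_0, const unsigned local_id_1) {
    for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
        for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
            real_type sum{ 0.0 };
            for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
                sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i];
            }
            temp[internal_i][internal_j] += sum;
        }
    }
}

// GPU-style update: dim is the slowest moving index, accumulating directly into the tile
inline void tile_update_gpu(const cache_type &A_cache, const cache_type &B_cache, tile_type &temp,
                            const unsigned local_id_0, const unsigned local_id_1) {
    for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) {
        for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) {
            for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) {
                temp[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i];
            }
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------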
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -182,14 +207,14 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -208,63 +233,79 @@ class device_kernel_symm_mirror { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current 
block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all threads loaded their part of the data - - // perform 
the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS 
operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -280,8 +321,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -298,6 +339,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
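// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// Taken together, device_kernel_symm and device_kernel_symm_mirror above compute the BLAS SYMM
// update C = alpha * A * B + beta * C, where A is symmetric, only its upper triangle is stored,
// and its rows may be split across devices. The dense, single-device host reference below shows
// the operation itself; the flat row-major std::vector buffers and the m/n naming are assumptions
// for this sketch and deliberately ignore the packing, padding, and mirroring of the real kernels.
#include <cstddef>
#include <vector>

namespace sketch {

// C (m x n) = alpha * A (m x m, dense symmetric) * B (m x n) + beta * C
inline void symm_reference(const double alpha, const std::vector<double> &A, const std::vector<double> &B,
                           const double beta, std::vector<double> &C, const std::size_t m, const std::size_t n) {
    for (std::size_t row = 0; row < m; ++row) {
        for (std::size_t col = 0; col < n; ++col) {
            double sum{ 0.0 };
            for (std::size_t k = 0; k < m; ++k) {
                sum += A[row * m + k] * B[k * n + col];  // A[row][k] * B[k][col]
            }
            C[row * n + col] = alpha * sum + beta * C[row * n + col];
        }
    }
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------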
* @param[in] num_cols the number of columns in both matrices @@ -319,25 +363,27 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -358,6 +404,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
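// --- [Illustrative sketch, not part of the patch] ---------------------------------------------
// Every kernel in these headers receives grid_x_offset / grid_y_offset and adds them to the
// work-group id: when the logical grid is larger than what a single launch may cover, the host can
// submit the same kernel several times, once per chunk, passing the chunk's offsets. The toy
// functor and host loop below only illustrate that scheme; the chunk limit, work-group size,
// USM output buffer, and the functor itself are assumptions and do not mirror PLSSVM's actual
// launch code.
#include <algorithm>
#include <cstddef>
#include <sycl/sycl.hpp>

namespace sketch {

class toy_offset_kernel {
  public:
    toy_offset_kernel(int *out, const std::size_t num_groups_x, const std::size_t grid_x_offset, const std::size_t grid_y_offset) :
        out_{ out }, num_groups_x_{ num_groups_x }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { }

    void operator()(::sycl::nd_item<2> nd_idx) const {
        // reconstruct the position in the *logical* grid from the chunk-local group id plus the offsets
        const std::size_t block_x = nd_idx.get_group(0) + grid_x_offset_;
        const std::size_t block_y = nd_idx.get_group(1) + grid_y_offset_;
        out_[block_y * num_groups_x_ + block_x] = 1;  // every work-item of the group writes the same flag
    }

  private:
    int *out_;  // assumed to be a USM allocation accessible on the device
    std::size_t num_groups_x_;
    std::size_t grid_x_offset_;
    std::size_t grid_y_offset_;
};

inline void launch_in_chunks(::sycl::queue &queue, int *out, const std::size_t num_groups_x, const std::size_t num_groups_y) {
    constexpr std::size_t max_groups_per_launch = 4;  // assumed per-dimension limit of a single launch
    constexpr std::size_t local_size = 8;             // assumed work-group size per dimension
    for (std::size_t gy = 0; gy < num_groups_y; gy += max_groups_per_launch) {
        for (std::size_t gx = 0; gx < num_groups_x; gx += max_groups_per_launch) {
            const std::size_t chunk_x = std::min(max_groups_per_launch, num_groups_x - gx);
            const std::size_t chunk_y = std::min(max_groups_per_launch, num_groups_y - gy);
            queue.submit([&](::sycl::handler &cgh) {
                const ::sycl::nd_range<2> execution_range{ ::sycl::range<2>{ chunk_x * local_size, chunk_y * local_size },
                                                           ::sycl::range<2>{ local_size, local_size } };
                cgh.parallel_for(execution_range, toy_offset_kernel{ out, num_groups_x, gx, gy });
            });
        }
    }
    queue.wait();
}

}  // namespace sketch
// -----------------------------------------------------------------------------------------------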
* @param[in] num_cols the number of columns in the matrix @@ -379,25 +428,27 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 96030fbe7..b4b836b14 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -13,9 +13,11 @@ #define 
PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -27,20 +29,24 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template <kernel_function_type kernel_function, typename... Args> +template <target_platform target, kernel_function_type kernel_function, typename... Args> class device_kernel_assembly { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -49,21 +55,21 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_j_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + data_i_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_j_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -76,74 +82,92 @@ class device_kernel_assembly { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group 
y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, 
pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + 
static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } @@ -152,16 +176,16 @@ class device_kernel_assembly { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor data_i_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor data_j_cache_; /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 7b517a7b1..1a8c71c1d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_BASIC_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p 
alpha is a scalar. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
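
The explicit work-group kernel above stores only the upper triangular part of the kernel matrix in a packed, padded layout: each value ends up at `device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + 1) / 2 + device_global_i_idx`. As a sanity check, here is a minimal standalone sketch (plain C++, no SYCL; `n` and `padding` are made-up stand-ins for the device-local row count and `PADDING_SIZE`) showing that this formula packs one padded row per column index `j` and never maps two stored entries to the same slot:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Minimal sketch of the padded, packed triangular indexing used when the explicit
// kernel matrix is written back; n and padding are made-up example values.
int main() {
    const std::size_t n = 6;        // rows this device is responsible for (num_rows - device_row_offset)
    const std::size_t padding = 2;  // stands in for PADDING_SIZE

    // index(i, j) = j * (n + padding) - j * (j + 1) / 2 + i, valid for i >= j
    const auto packed_index = [&](const std::size_t i, const std::size_t j) {
        return j * (n + padding) - j * (j + 1) / 2 + i;
    };

    // total number of stored entries: one padded row per column index j
    std::size_t total = 0;
    for (std::size_t j = 0; j < n; ++j) {
        total += n + padding - j;
    }
    std::vector<int> hits(total, 0);

    // every (i, j) pair of the stored triangle maps to a unique, in-bounds slot
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n; ++i) {
            ++hits[packed_index(i, j)];
        }
    }
    for (std::size_t j = 0; j < n; ++j) {
        for (std::size_t i = j; i < n; ++i) {
            std::cout << "(" << i << ", " << j << ") -> " << packed_index(i, j)
                      << (hits[packed_index(i, j)] == 1 ? "" : "  <-- collision!") << '\n';
        }
    }
}
```

The per-row padding mirrors the `PADDING_SIZE` used elsewhere in these kernels, presumably so that the blocked `device_global_i_idx` accesses can run past `num_rows_` without extra bounds checks.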
@@ -75,28 +81,53 @@ class device_kernel_assembly_symm { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -104,28 +135,48 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { - temp_ij += cost_; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = 
device_row_offset_ + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; // symmetry - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } } @@ -137,11 +188,12 @@ class device_kernel_assembly_symm { private: /// @cond Doxygen_suppress const real_type alpha_; + const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 1a24024b6..08ed85c0c 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_HIERARCHICAL_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C 
are matrices, and @p alpha is a scalar. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,37 +73,45 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // the indices used in the current work-item + ::sycl::private_memory i_idx{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx{ group }; // device_num_rows + + ::sycl::private_memory i_idx_linear{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx_linear{ group }; // device_num_rows + + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; // initialize private and local variables group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -111,32 +125,36 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -144,14 +162,30 @@ class device_kernel_assembly_symm { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -165,16 +199,18 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto 
global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -187,47 +223,51 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // allocate shared memory - real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + 
THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; // SoA } }); // implicit group barrier - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -235,20 +275,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_y = idx.get_local_id(1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1 + THREAD_BLOCK_SIZE]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; // SoA } }); @@ -260,10 +302,11 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j 
< INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } @@ -272,47 +315,51 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + // store the values in the local memory + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - 
C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -320,20 +367,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); @@ -347,10 +396,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git 
a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
index 4391f2f19..d7593084b 100644
--- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
@@ -13,10 +13,12 @@
 #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_SCOPED_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_
 #pragma once

+#include "plssvm/backends/SYCL/data_parallel_kernels.hpp"    // plssvm::sycl::data_parallel_kernel
 #include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
 #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
-#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE}
+#include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform

 #include "sycl/sycl.hpp"  // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item

@@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::scoped {

 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
  * @details Uses AdaptiveCpp's scoped parallelism.
+ * @tparam target the target platform
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <target_platform target, kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly_symm {
   public:
+    /// The used SYCL data parallel kernel.
+    constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped;
+
     /**
      * @brief Initialize the SYCL kernel function object.
* @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
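For readers following the kernel hunks below, here is a minimal host-side reference sketch (not part of this patch) of what the fused "implicit kernel matrix" SYMM performs: each entry a_ij = k(x_i, x_j) + QA_cost - q_i - q_j (plus cost on the diagonal) is recomputed on the fly and immediately folded into C = alpha * A * B + C, so A is never materialized in global memory. All names (implicit_symm_reference, X, kernel) are hypothetical, and real_type is assumed to be double here.

#include <cstddef>   // std::size_t
#include <vector>    // std::vector

using real_type = double;  // plssvm::real_type is build-configurable; double is assumed in this sketch

// Hypothetical host-side reference of the implicit SYMM operation performed by the device kernels.
template <typename Kernel>
void implicit_symm_reference(const Kernel &kernel, const std::vector<std::vector<real_type>> &X,
                             const std::vector<real_type> &q, const real_type QA_cost, const real_type cost,
                             const real_type alpha, const std::vector<std::vector<real_type>> &B,
                             std::vector<std::vector<real_type>> &C) {
    const std::size_t num_rows = X.size();
    const std::size_t num_classes = B.front().size();
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_rows; ++j) {
            // implicitly calculated kernel matrix entry -> never stored in global memory
            real_type a_ij = kernel(X[i], X[j]) + QA_cost - q[i] - q[j];
            if (i == j) {
                a_ij += cost;  // the cost factor is only applied to the main diagonal
            }
            for (std::size_t c = 0; c < num_classes; ++c) {
                C[i][c] += alpha * a_ij * B[j][c];  // C = alpha * A * B + C
            }
        }
    }
}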
@@ -77,88 +83,124 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // the indices used in the current work-item + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + [&](auto &i_idx, auto &j_idx, auto &i_idx_linear, auto &j_idx_linear, auto &cache_one, auto &cache_two, auto &temp) { // initialize private and local variables ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, 
pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into local memory - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - }); - - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // rename cached arrays + auto &data_i_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] 
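As a standalone illustration of the index arithmetic introduced above (i_idx/j_idx versus their *_linear counterparts), the following sketch shows how a work-item's strided tile index and its coalesced linear index are derived from the work-group and work-item IDs. The block-size values are assumptions for the example; the real constants come from plssvm/constants.hpp.

#include <cstddef>  // std::size_t

// assumed example values; the actual constants are defined in plssvm/constants.hpp
constexpr unsigned THREAD_BLOCK_SIZE = 16;
constexpr unsigned INTERNAL_BLOCK_SIZE = 4;

// strided index: the work-item owns the INTERNAL_BLOCK_SIZE consecutive rows starting at this index
std::size_t strided_index(const std::size_t block_idx, const std::size_t block_dim, const std::size_t thread_idx) {
    return (block_idx * block_dim + thread_idx) * INTERNAL_BLOCK_SIZE;
}

// linear index: neighboring work-items map to neighboring rows, so accesses of the form
// data[... + linear_index + internal * THREAD_BLOCK_SIZE] are coalesced within a work-group
std::size_t linear_index(const std::size_t block_idx, const std::size_t block_dim, const std::size_t thread_idx) {
    return block_idx * block_dim * INTERNAL_BLOCK_SIZE + thread_idx;
}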
+ + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } - } - }); + }); + } } // apply the remaining part of 
the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -169,64 +211,70 @@ class device_kernel_assembly_symm { } }); - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = cache_one; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } }); - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const 
std::size_t threadIdx_y = idx.get_local_id(group, 1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); } @@ -236,82 +284,83 @@ class device_kernel_assembly_symm { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } } }); - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // rename local memory + auto &B_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 
64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); - // implicit group barrier - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); - - // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, 
[&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); - - // implicit group barrier } } } @@ -322,10 +371,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 34b55fff4..015268fa2 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_CG_IMPLICIT_WORK_GROUP_KERNEL_MATRIX_ASSEMBLY_BLAS_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,21 +30,25 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha 
* A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -53,15 +59,15 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + cache_one_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + cache_two_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -70,7 +76,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -82,50 +88,72 @@ class device_kernel_assembly_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + 
threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { + // rename cached arrays + auto &data_i_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + 
std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -136,16 +164,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper 
triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -155,42 +185,44 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = cache_one_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 
0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all work-items updated C with their values } @@ -199,51 +231,55 @@ // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t 
class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + // calculate intermediate results and store them in local memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; 
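The detail::atomic_op accumulation used here (and in the other BLAS-like kernels) resolves the race where multiple work-groups add their partial results into the same entry of C. As a rough, hedged sketch of what such a helper can look like with standard SYCL 2020 facilities; the actual wrapper lives in plssvm/backends/SYCL/detail/atomics.hpp and may differ:

#include <sycl/sycl.hpp>  // sycl::atomic_ref

// Hypothetical helper, shown for illustration only: atomically adds `value` to the global-memory
// location `*address`, which several work-groups may update concurrently.
template <typename T>
void atomic_add(T *address, const T value) {
    ::sycl::atomic_ref<T,
                       ::sycl::memory_order::relaxed,
                       ::sycl::memory_scope::device,
                       ::sycl::access::address_space::global_space>
        ref{ *address };
    ref.fetch_add(value);
}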
+ // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all threads updated C with their values } @@ -253,17 +289,17 @@ class device_kernel_assembly_symm { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp index 97c5c6248..6cfa159bc 100644 --- a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp @@ -30,42 +30,17 @@ namespace plssvm::sycl::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index c16965cb1..9e838a89c 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_BASIC_PREDICT_KERNEL_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include 
"plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -26,31 +28,36 @@ namespace plssvm::sycl::detail::basic { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -60,77 +67,106 @@ class device_kernel_w_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const 
std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { - // perform the dot product calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + } + } + } + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } - - // update global array with local one - for 
(unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; - } - } } private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -143,46 +179,70 @@ class device_kernel_predict_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the dot product calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } } } } - // update global array with local one - for (unsigned internal_pd = 0; 
internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -192,21 +252,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -215,19 +279,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -235,54 +299,83 @@ class device_kernel_predict { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], - predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = 
pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } } } } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; ++dim) { + // iterate over all classes using blocking + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { if (sv_idx == 0) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + // calculate the index to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += -rho_[class_block + class_idx]; + } } } - // calculate intermediate results and store them in local memory - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // atomically add the results to the prediction + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - detail::atomic_op{ prediction_d_[global_pp_idx * 
(num_classes_ + PADDING_SIZE_uz) + dim] } += - temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += + temp[internal_pp][internal_sv] * alpha_[(class_block + class_idx) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } } } } @@ -290,11 +383,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 4098c4914..ea8bd5b6e 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_HIERARCHICAL_PREDICT_KERNEL_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -26,31 +28,36 @@ namespace plssvm::sycl::detail::hierarchical { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
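// Illustrative sketch, not part of the patch: the idea behind the "if constexpr (target == target_platform::cpu)"
// branches introduced above. Both branches compute the same blocked accumulation; on the CPU the block index is
// the fastest moving (innermost) loop so the compiler sees a contiguous, auto-vectorizable reduction, while
// otherwise it is the slowest moving (outermost) loop, mirroring the ordering of the original GPU kernels.
// The enum, the block size, and the 2x2 accumulator are simplified stand-ins, not the library's types.
#include <cassert>
#include <cstddef>
#include <vector>

enum class platform { cpu, gpu };  // stand-in for plssvm::target_platform

template <platform target>
void accumulate_block(double (&temp)[2][2], const std::vector<double> &a, const std::vector<double> &b,
                      std::size_t block_start, std::size_t block_size, std::size_t stride) {
    if constexpr (target == platform::cpu) {
        // block entries are the fastest moving index -> one contiguous reduction per (i, j) pair
        for (std::size_t i = 0; i < 2; ++i) {
            for (std::size_t j = 0; j < 2; ++j) {
                double sum = 0.0;
                for (std::size_t k = 0; k < block_size; ++k) {
                    sum += a[i * stride + block_start + k] * b[j * stride + block_start + k];
                }
                temp[i][j] += sum;
            }
        }
    } else {
        // block entries are the slowest moving index
        for (std::size_t k = 0; k < block_size; ++k) {
            for (std::size_t i = 0; i < 2; ++i) {
                for (std::size_t j = 0; j < 2; ++j) {
                    temp[i][j] += a[i * stride + block_start + k] * b[j * stride + block_start + k];
                }
            }
        }
    }
}

int main() {
    const std::size_t stride = 16;  // row length of the two small row-major matrices
    std::vector<double> a(2 * stride, 1.0), b(2 * stride, 2.0);
    double cpu_result[2][2]{}, gpu_result[2][2]{};
    for (std::size_t block_start = 0; block_start < stride; block_start += 8) {
        accumulate_block<platform::cpu>(cpu_result, a, b, block_start, 8, stride);
        accumulate_block<platform::gpu>(gpu_result, a, b, block_start, 8, stride);
    }
    assert(cpu_result[0][0] == gpu_result[0][0]);  // both loop orders produce identical results
    return 0;
}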
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -59,36 +66,15 @@ class device_kernel_w_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory feature_idx{ group }; - ::sycl::private_memory feature_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads 
in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -99,23 +85,36 @@ class device_kernel_w_linear { // implicit group barrier // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global 
data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); @@ -123,13 +122,28 @@ class device_kernel_w_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } @@ -138,16 +152,30 @@ class device_kernel_w_linear { // implicit group barrier } - // update global array with local one + // update the global w-vector with the locally cached values group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -155,41 +183,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
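// Illustrative sketch, not part of the patch: the SYCL hierarchical pattern used above in miniature.
// Arrays declared at ::sycl::group scope are placed in local memory and shared by the work-group,
// ::sycl::private_memory keeps per-work-item state alive across parallel_for_work_item scopes, and an
// implicit group barrier separates consecutive parallel_for_work_item calls. Queue setup, sizes, and
// the reduction itself are illustrative assumptions, not the library's kernels.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
    constexpr std::size_t local_size = 8;
    constexpr std::size_t num_groups = 4;
    std::vector<float> result(num_groups, 0.0f);
    {
        sycl::queue q{};
        sycl::buffer<float, 1> buf{ result.data(), sycl::range<1>{ num_groups } };
        q.submit([&](sycl::handler &cgh) {
            auto out = buf.get_access<sycl::access_mode::write>(cgh);
            cgh.parallel_for_work_group(sycl::range<1>{ num_groups }, sycl::range<1>{ local_size }, [=](sycl::group<1> group) {
                float cache[local_size];                       // group scope -> local memory
                sycl::private_memory<float, 1> priv{ group };  // per-work-item state

                group.parallel_for_work_item([&](sycl::h_item<1> idx) {
                    priv(idx) = static_cast<float>(idx.get_local_id(0));
                    cache[idx.get_local_id(0)] = priv(idx);    // each work-item fills one slot
                });
                // implicit group barrier here: cache is fully written before it is read below
                group.parallel_for_work_item([&](sycl::h_item<1> idx) {
                    if (idx.get_local_id(0) == 0) {
                        float sum = 0.0f;
                        for (std::size_t i = 0; i < local_size; ++i) { sum += cache[i]; }
                        out[group[0]] = sum;                   // one work-item writes the group result
                    }
                });
            });
        });
    }  // buffer destruction copies the result back to the host vector
    return 0;
}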
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -201,35 +234,15 @@ class device_kernel_predict_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -240,27 +253,38 @@ class device_kernel_predict_linear { // implicit group barrier - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into shared memory + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes + + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - 
data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); @@ -268,13 +292,28 @@ class device_kernel_predict_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } @@ -283,16 +322,30 @@ class device_kernel_predict_linear { // implicit group barrier } - // update global array with local one + // update the global array with the local one group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent 
potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -300,10 +353,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -313,21 +366,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
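// Illustrative sketch, not part of the patch: the padded 2D indexing convention that appears throughout
// these kernels. A logically num_rows x num_cols matrix is stored row-major with a padded leading
// dimension of (num_cols + PADDING_SIZE); the blocked loads above may read up to a block past the last
// valid column or row, and the padding keeps those accesses inside the allocation. The PADDING_SIZE
// value and buffer sizes below are assumptions for the illustration only.
#include <cstddef>
#include <vector>

constexpr std::size_t PADDING_SIZE = 16;  // assumed; the real value comes from plssvm/constants.hpp

// index into a row-major matrix with padded rows, matching expressions such as
// prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] above
constexpr std::size_t padded_index(std::size_t row, std::size_t col, std::size_t num_cols) {
    return row * (num_cols + PADDING_SIZE) + col;
}

int main() {
    const std::size_t num_predict_points = 10;
    const std::size_t num_classes = 4;
    // allocate with padding in both dimensions, mirroring the padded device buffers
    std::vector<double> prediction((num_predict_points + PADDING_SIZE) * (num_classes + PADDING_SIZE), 0.0);
    prediction[padded_index(3, 2, num_classes)] = 1.5;  // write the prediction for point 3, class 2
    return 0;
}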
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -336,51 +393,34 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory sv_idx_linear{ group }; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -392,27 +432,42 @@ class device_kernel_predict { // implicit group barrier { + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &pp_cache = cache_one; + // auto &sv_cache = cache_two; + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of 
work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; } }); @@ -420,14 +475,30 @@ class device_kernel_predict { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) 
{ - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -439,9 +510,9 @@ class device_kernel_predict { // update temp using the respective kernel function group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); @@ -449,36 +520,42 @@ class device_kernel_predict { // implicit group barrier { - // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception - // auto &alpha_cache = data_cache_pp; - // auto &out_cache = data_cache_sv; + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &alpha_cache = cache_one; + // auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // 
current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -486,15 +563,16 @@ class device_kernel_predict { // implicit group barrier // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + cache_two[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * 
THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * cache_one[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -502,21 +580,29 @@ class device_kernel_predict { // implicit group barrier } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); @@ -527,11 +613,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index 1a42161f5..e26025670 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_SCOPED_PREDICT_KERNEL_HPP_ #pragma once +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // 
plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -26,31 +28,36 @@ namespace plssvm::sycl::detail::scoped { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -62,78 +69,101 @@ class device_kernel_w_linear { template void operator()(T group) 
const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // feature_cache + ::sycl::require_local_mem(), // alpha_cache + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &feature_cache, auto &alpha_cache, auto &temp) { // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of 
work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; 
++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } }); } - // update global array with local one + // update the global w-vector with the locally cached values ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -142,41 +172,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. 
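The if constexpr (target == target_platform::cpu) branches introduced above switch between two mathematically identical loop orders for the blocked dot product, presumably so the CPU path exposes a plain innermost reduction to the auto-vectorizer while the default path reuses each cached value across the whole register block. The standalone sketch below shows the two orders side by side; the names and block sizes are hypothetical stand-ins, not code from this patch.

#include <array>

constexpr unsigned THREAD_BLOCK = 8;  // stand-in for THREAD_BLOCK_SIZE
constexpr unsigned INTERNAL = 4;      // stand-in for INTERNAL_BLOCK_SIZE

using cache_t = std::array<std::array<double, INTERNAL>, THREAD_BLOCK>;
using block_t = std::array<std::array<double, INTERNAL>, INTERNAL>;

// CPU-style: the reduction loop is innermost, so each temp entry is a simple sum
// that the auto-vectorizer can turn into one vectorized reduction.
void accumulate_cpu(block_t &temp, const cache_t &a, const cache_t &b) {
    for (unsigned i = 0; i < INTERNAL; ++i) {
        for (unsigned j = 0; j < INTERNAL; ++j) {
            double sum = 0.0;
            for (unsigned k = 0; k < THREAD_BLOCK; ++k) {
                sum += a[k][i] * b[k][j];
            }
            temp[i][j] += sum;
        }
    }
}

// default style: the reduction loop is outermost, so each loaded cache value is
// reused across the whole INTERNAL x INTERNAL register block before moving on.
void accumulate_default(block_t &temp, const cache_t &a, const cache_t &b) {
    for (unsigned k = 0; k < THREAD_BLOCK; ++k) {
        for (unsigned i = 0; i < INTERNAL; ++i) {
            for (unsigned j = 0; j < INTERNAL; ++j) {
                temp[i][j] += a[k][i] * b[k][j];
            }
        }
    }
}

// Both functions produce identical temp blocks; only the memory-access order differs.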
+ constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -191,81 +226,102 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = 
(blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // pp_cache + ::sycl::require_local_mem(), // w_cache - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &pp_cache, auto &w_cache, auto &temp) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // load data into shared memory + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + 
PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } }); } - // update global array with local one + // update the global array with the local one ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group 
x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -274,10 +330,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -287,21 +343,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::scoped; + /** * @brief Initialize the SYCL kernel function object. 
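The scoped kernels in this file rely on AdaptiveCpp's scoped-parallelism extension (memory_environment, require_local_mem, require_private_mem, distribute_items_and_wait, s_item). The shape-only sketch below reflects my reading of that extension: local memory is requested per work-group, private memory per logical work-item, and the work is split into distribute_items_and_wait phases with implicit barriers in between. The types, sizes, and the 1-D setup are illustrative assumptions, not code from this patch.

#include <sycl/sycl.hpp>

#include <cstddef>

// requires AdaptiveCpp, which provides the scoped-parallelism extension
template <typename Group>
void scoped_pattern(Group group, float *out) {
    ::sycl::memory_environment(group,
        ::sycl::require_local_mem<float[64]>(),  // shared by the whole work-group
        ::sycl::require_private_mem<float>(),    // one instance per logical work-item
        [&](auto &cache, auto &acc) {
            // phase 1: fill the work-group local cache
            ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                cache[idx.get_local_id(group, 0)] = 1.0f;
            });
            // implicit barrier between the two phases
            // phase 2: consume the cache through the per-work-item private accumulator
            ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<1> idx) {
                acc(idx) = cache[idx.get_local_id(group, 0)];
                out[static_cast<std::size_t>(group[0]) * static_cast<std::size_t>(group.get_logical_local_range(0)) + idx.get_local_id(group, 0)] = acc(idx);
            });
        });
}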
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -310,19 +370,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
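In the kernel body that follows, work-groups along the support-vector dimension accumulate partial sums for the same (predict point, class) output entries; that is why the results are added atomically via detail::atomic_op and why the bias -rho is contributed only by the work-groups with blockIdx_x == 0. The sequential host-side sketch below restates that accounting with illustrative names only; it is not part of the patch.

#include <cstddef>
#include <vector>

// prediction: num_points * num_classes entries, zero-initialised
// partial:    one partial-sum vector per support-vector partition (work-groups along x)
// rho:        one bias per class
void accumulate_partitions(std::vector<double> &prediction,
                           const std::vector<std::vector<double>> &partial,
                           const std::vector<double> &rho,
                           const std::size_t num_points,
                           const std::size_t num_classes) {
    for (std::size_t part = 0; part < partial.size(); ++part) {
        for (std::size_t p = 0; p < num_points; ++p) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                // equivalent of the blockIdx_x == 0 check: only the first partition carries the bias,
                // otherwise -rho would be applied once per partition instead of once per entry
                const double bias = (part == 0) ? -rho[c] : 0.0;
                prediction[p * num_classes + c] += partial[part][p * num_classes + c] + bias;
            }
        }
    }
}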
@@ -332,143 +392,175 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + [&](auto &cache_one, auto &cache_two, auto &temp) { + { + // rename cached arrays + auto &pp_cache = cache_one; + auto &sv_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - } - }); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) 
{ - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } } } - } - }); + }); + } } // update temp using the respective kernel function ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); { // rename cached arrays - auto &alpha_cache = data_cache_pp; - auto &out_cache = data_cache_sv; + auto &alpha_cache = cache_one; + auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto 
THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % 
THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -478,11 +570,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index d451ac7d5..bef23d533 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -13,10 +13,12 @@ #define PLSSVM_BACKENDS_SYCL_KERNEL_PREDICT_WORK_GROUP_PREDICT_KERNEL_HPP_ #pragma once +#include 
"plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -26,34 +28,39 @@ namespace plssvm::sycl::detail::work_group { /** - * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. + * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(::sycl::handler &cgh, real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_feature_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_alpha_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(::sycl::handler &cgh, real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + 
feature_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + alpha_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -67,104 +74,130 @@ class device_kernel_w_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { - // load data into local memory - for (unsigned internal = 0; internal < 
INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - data_cache_feature_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the local memory + feature_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < 
INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_feature_; + ::sycl::local_accessor feature_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_alpha_; + ::sycl::local_accessor alpha_cache_; /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL data parallel kernel. + constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + pp_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -181,74 +214,91 @@ class device_kernel_predict_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - 
const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for 
(unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = 
pp_idx + static_cast<std::size_t>(internal_pd);
-                prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx];
+                prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx];  // AoS
             }
         }
     }

   private:
     /// Local memory used for internal memory access optimizations.
-    ::sycl::local_accessor<real_type, 2> data_cache_pp_;
+    ::sycl::local_accessor<real_type, 2> pp_cache_;
     /// Local memory used for internal memory access optimizations.
-    ::sycl::local_accessor<real_type, 2> data_cache_w_;
+    ::sycl::local_accessor<real_type, 2> w_cache_;

     /// @cond Doxygen_suppress
-    real_type *prediction_d_;
-    const real_type *w_d_;
-    const real_type *rho_d_;
-    const real_type *predict_points_d_;
+    real_type *prediction_;
+    const real_type *w_;
+    const real_type *rho_;
+    const real_type *predict_points_;
     const std::size_t num_classes_;
     const std::size_t num_predict_points_;
     const std::size_t num_features_;
@@ -258,22 +308,26 @@ class device_kernel_predict_linear {
 };

 /**
- * @brief Predict the @p predict_points_d using the @p kernel_function.
+ * @brief Predict the @p predict_points using the @p kernel_function.
  * @details Uses SYCL's work-group data parallel kernels.
+ * @tparam target the target platform
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple`
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <target_platform target, kernel_function_type kernel_function, typename... Args>
 class device_kernel_predict {
   public:
+    /// The used SYCL data parallel kernel.
+    constexpr static sycl::data_parallel_kernel data_parallel_kernel_type = sycl::data_parallel_kernel::work_group;
+
     /**
      * @brief Initialize the SYCL kernel function object.
      * @param[in] cgh the SYCL handler used to allocate the local memory
-     * @param[in] prediction_d the predicted values
-     * @param[in] alpha_d the previously learned weights
-     * @param[in] rho_d the previously learned biases
-     * @param[in] sv_d the support vectors
-     * @param[in] predict_points_d the data points to predict
+     * @param[out] prediction the predicted values
+     * @param[in] alpha the previously learned weights
+     * @param[in] rho the previously learned biases
+     * @param[in] support_vectors the support vectors
+     * @param[in] predict_points the data points to predict
      * @param[in] num_classes the number of classes
      * @param[in] num_sv the number of support vectors
      * @param[in] num_predict_points the number of data points to predict
@@ -282,21 +336,21 @@ class device_kernel_predict {
      * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
      * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function
      */
-    device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args...
kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(::sycl::handler &cgh, real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + cache_one_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + cache_two_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -308,47 +362,63 @@ class device_kernel_predict { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const 
auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { + // rename cached arrays + auto &pp_cache = cache_one_; + auto &sv_cache = cache_two_; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd 
= 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -357,54 +427,57 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { // rename cached arrays - auto &alpha_cache = data_cache_pp_; - auto &out_cache = data_cache_sv_; + auto &alpha_cache = cache_one_; + auto &out_cache = cache_two_; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices 
to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } @@ -413,16 +486,16 @@ class device_kernel_predict { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. 
-    ::sycl::local_accessor<real_type, 2> data_cache_sv_;
+    ::sycl::local_accessor<real_type, 2> cache_two_;
     /// @cond Doxygen_suppress
-    real_type *prediction_d_;
-    const real_type *alpha_d_;
-    const real_type *rho_d_;
-    const real_type *sv_d_;
-    const real_type *predict_points_d_;
+    real_type *prediction_;
+    const real_type *alpha_;
+    const real_type *rho_;
+    const real_type *support_vectors_;
+    const real_type *predict_points_;
     const std::size_t num_classes_;
     const std::size_t num_sv_;
     const std::size_t num_predict_points_;
diff --git a/include/plssvm/backends/gpu_csvm.hpp b/include/plssvm/backends/gpu_csvm.hpp
index 520f665ea..b05d21ab5 100644
--- a/include/plssvm/backends/gpu_csvm.hpp
+++ b/include/plssvm/backends/gpu_csvm.hpp
@@ -143,12 +143,13 @@ class gpu_csvm : virtual public ::plssvm::csvm {
      * @param[in] device_id the device to run the kernel on
      * @param[in] exec the execution range used in the device call
      * @param[in] params the parameters (e.g., kernel function) used to assemble the kernel matrix
+     * @param[in] use_usm_allocations if `true` use USM allocations for the `cg_streaming` implementation
      * @param[in] data_d the data set to create the kernel matrix from
      * @param[in] q_red_d the vector used in the dimensional reduction
      * @param[in] QA_cost the scalar used in the dimensional reduction
      * @return the explicit kernel matrix stored on the device (`[[nodiscard]]`)
      */
-    [[nodiscard]] virtual device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const execution_range &exec, const parameter &params, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const = 0;
+    [[nodiscard]] virtual device_ptr_type run_assemble_kernel_matrix_explicit(std::size_t device_id, const execution_range &exec, const parameter &params, bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const = 0;
     /**
      * @brief Perform an explicit BLAS level 3 operation: `C = alpha * A * B + beta * C` where @p A, @p B, and @p C are matrices, and @p alpha and @p beta are scalars.
* @param[in] device_id the device to run the kernel on @@ -231,6 +232,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvmnum_available_devices(); const std::size_t num_rows_reduced = A.shape().x - 1; @@ -296,9 +298,10 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, use_usm_allocations, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -389,6 +392,7 @@ void gpu_csvm::blas_level_3(const solver // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const auto &A_d = detail::move_only_any_cast(A[device_id]); PLSSVM_ASSERT(!A_d.empty(), "The A matrix must not be empty!"); diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index 78729691f..feb404b3d 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -19,8 +19,12 @@ #include "plssvm/matrix.hpp" // plssvm::layout_type, plssvm::matrix #include "plssvm/shape.hpp" // plssvm::shape -#include // std::size_t -#include // std::vector +#include "fmt/format.h" // fmt::format + +#include // std::min +#include // std::size_t +#include // std::swap, std::exchange +#include // std::vector namespace plssvm::detail { @@ -56,23 +60,26 @@ class gpu_device_ptr { * @brief Construct a device_ptr for the device managed by @p queue with the extents { @p size, 1 }. * @param[in] size the size of the managed memory * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ - gpu_device_ptr(size_type size, const queue_type queue); + gpu_device_ptr(size_type size, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape. * @details The managed memory size is: extents[0] * extents[1]. * @param[in] shape the 2D size of the managed memory; size = shape.x * shape.y * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ - gpu_device_ptr(plssvm::shape shape, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape including @p padding. * @details The managed memory size is: (shape.x + padding.x) * (shape.y + padding.y). * @param[in] shape the extents of the managed memory * @param[in] padding the padding applied to the extents * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ - gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue, bool use_usm_allocations); /** * @brief Delete copy-constructor to make device_ptr a move only type. @@ -228,7 +235,7 @@ class gpu_device_ptr { */ void fill(value_type value, size_type pos = 0); /** - * @brief Fill up-to @p count values to @p value starting at position @p pos. 
+ * @brief Fill up-to @p count values of @p value starting at position @p pos. * @details Fill `[pos, rcount)` where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`. * @param[in] value the fill value * @param[in] pos the position to start the fill @@ -368,30 +375,36 @@ class gpu_device_ptr { plssvm::shape padding_{}; /// The device pointer pointing to the managed memory. device_pointer_type data_{}; + /// If true, use USM allocations automatically migrating the data between host and device. + bool use_usm_allocations_{}; }; template -gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ plssvm::shape{ size, 1 } } { } + shape_{ plssvm::shape{ size, 1 } }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ shape } { } + shape_{ shape }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, shape_{ shape }, - padding_{ padding } { } + padding_{ padding }, + use_usm_allocations_{ use_usm_allocations } { } template gpu_device_ptr::gpu_device_ptr(gpu_device_ptr &&other) noexcept : queue_{ std::exchange(other.queue_, queue_type{}) }, shape_{ std::exchange(other.shape_, plssvm::shape{}) }, padding_{ std::exchange(other.padding_, plssvm::shape{}) }, - data_{ std::exchange(other.data_, device_pointer_type{}) } { } + data_{ std::exchange(other.data_, device_pointer_type{}) }, + use_usm_allocations_{ std::exchange(other.use_usm_allocations_, false) } { } template auto gpu_device_ptr::operator=(gpu_device_ptr &&other) noexcept -> gpu_device_ptr & { @@ -401,6 +414,7 @@ auto gpu_device_ptr::opera shape_ = std::exchange(other.shape_, plssvm::shape{}); padding_ = std::exchange(other.padding_, plssvm::shape{}); data_ = std::exchange(other.data_, device_pointer_type{}); + use_usm_allocations_ = std::exchange(other.use_usm_allocations_, false); } return *this; } @@ -411,6 +425,7 @@ void gpu_device_ptr::swap( std::swap(shape_, other.shape_); std::swap(padding_, other.padding_); std::swap(data_, other.data_); + std::swap(use_usm_allocations_, other.use_usm_allocations_); } template diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp index 63e9f9831..a1dc4864a 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/blas.hpp @@ -13,10 +13,11 @@ #define PLSSVM_BACKENDS_STDPAR_KERNEL_CG_EXPLICIT_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} -#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/matrix.hpp" // plssvm::soa_matrix -#include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/matrix.hpp" // plssvm::soa_matrix +#include 
"plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array @@ -29,157 +30,218 @@ namespace plssvm::stdpar::detail { /** - * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. - * @param[in] num_rows the number of rows in @p A and @p C - * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for - * @param[in] alpha the scalar alpha value - * @param[in] A the matrix @p A - * @param[in] B the matrix @p B - * @param[in] beta the scalar beta value - * @param[in,out] C the matrix @p C, also used as result matrix + * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. + * @tparam target the target platform */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); - - // calculate constants - const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = A.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A_ptr[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_ptr[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; +template +struct device_kernel_symm { + /** + * @brief Perform an explicit BLAS SYMM operation. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + */ + void operator()(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); + PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); + + // calculate constants + const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); + std::iota(range.begin(), range.end(), 0); + + std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = A, B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all values + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr 
(target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A_ptr[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A_ptr[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + temp[internal_i][internal_j] += A_cache * B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } + } } - temp[internal_i][internal_j] += A_val * B_ptr[(dim + row_offset) * (num_rhs + PADDING_SIZE_uz) + global_rhs]; } } - } - - // apply the (partial) BLAS operation and update C - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs] = alpha * temp[internal_i][internal_j] + beta * C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs]; + + // apply the (partial) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = 
i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } } } - } - }); -} + }); + } +}; /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. - * @param[in] num_rows the number of rows in @p A and @p C - * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for - * @param[in] alpha the scalar alpha value - * @param[in] A the matrix @p A - * @param[in] B the matrix @p B - * @param[in] beta the scalar beta value - * @param[in,out] C the matrix @p C, also used as result matrix + * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! + * @tparam target the target platform */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); - - // calculate constants - const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = 
A.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A_ptr[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B_ptr[(dim + row_offset) * (num_rhs + PADDING_SIZE_uz) + global_rhs]; +template +struct device_kernel_symm_mirror { + /** + * @brief Perform an explicit BLAS SYMM operation. + * @param[in] num_rows the number of rows in @p A and @p C + * @param[in] num_rhs the number of columns in @p B and @p C + * @param[in] num_mirror_rows the number of rows to mirror down + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for + * @param[in] alpha the scalar alpha value + * @param[in] A the matrix @p A + * @param[in] B the matrix @p B + * @param[in] beta the scalar beta value + * @param[in,out] C the matrix @p C, also used as result matrix + */ + void operator()(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); + PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); + + // calculate constants + const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_mirror_rows = static_cast(std::ceil(static_cast(num_mirror_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); + std::iota(range.begin(), range.end(), 0); + + std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, A_ptr = A, B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over the remaining values + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + sum += A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_ptr[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_ptr[(dim_block + dim + device_row_offset) * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } + } } } - } - - // apply the (partial) BLAS operation and update C - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs] = alpha * temp[internal_i][internal_j] + beta * 
C_ptr[global_row * (num_rhs + PADDING_SIZE_uz) + global_rhs]; + + // apply the (remaining) BLAS operation and update C + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C_ptr[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; + } } } - } - }); -} + }); + } +}; } // namespace plssvm::stdpar::detail diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp index 93772aab3..fdb869351 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -14,14 +14,15 @@ #pragma once #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/matrix.hpp" // plssvm::aos_matrix +#include "plssvm/matrix.hpp" // plssvm::soa_matrix +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array -#include // std::ceil, std::sqrt +#include // std::ceil #include // std::size_t #include // std::execution::par_unseq #include // std::iota @@ -30,88 +31,118 @@ namespace plssvm::stdpar::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function - * @param[out] kernel_matrix the resulting kernel matrix - * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for - * @param[in] q the `q` vector - * @param[in] QA_cost he bottom right matrix entry multiplied by cost - * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @brief Create the explicit kernel matrix using the @p kernel_function. + * @tparam target the target platform + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { - PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); - PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); - - // calculate constants - const std::size_t num_rows = data.num_rows() - 1; - const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated - std::iota(indices.begin(), indices.end(), 0); - - std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), kernel_matrix_ptr = kernel_matrix.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - - // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_row], data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_col]); +template +struct device_kernel_assembly { + /** + * @brief Assemble the kernel matrix using the specified kernel function. 
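The explicit assembly kernel introduced in this hunk writes only the upper-triangular part of the kernel matrix, stored row-wise with PADDING_SIZE padding entries per row; the index expression that appears further down (j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - j_idx * (j_idx + 1) / 2 + i_idx) implements the mapping sketched below. The helper name and signature are hypothetical, the formula itself is taken from the hunk:

#include <cassert>
#include <cstddef>

// packed, padded upper-triangular layout: row j stores the entries for columns j..n-1,
// followed by `pad` padding values, so each row is one entry shorter than the previous one
inline std::size_t packed_upper_index(const std::size_t j, const std::size_t i,
                                      const std::size_t n, const std::size_t pad) {
    assert(j <= i && i < n);  // only the upper-triangular part (i >= j) is stored
    return j * (n + pad) - j * (j + 1) / 2 + i;
}

// example with n = 4, pad = 0: (0,0) -> 0, (1,1) -> 4, (2,2) -> 7, (3,3) -> 9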
+ * @param[out] kernel_matrix the resulting kernel matrix + * @param[in] data the data matrix + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for + * @param[in] q the `q` vector + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] kernel_function_parameter the potential additional arguments for the kernel function + */ + void operator()(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); + PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); + PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); + + // calculate constants + const std::size_t num_rows = data.num_rows() - 1; + const std::size_t num_features = data.num_cols(); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); + std::iota(indices.begin(), indices.end(), 0); + + std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), kernel_matrix_ptr = kernel_matrix](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + + // only calculate the upper triangular matrix + if (i_idx >= j_idx) { + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + // iterate over all features + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target != target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; 
feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } + } } } - } - // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q_ptr[global_row] - q_ptr[global_col]; - // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q_ptr[global_i_idx] - q_ptr[global_j_idx]; + // apply the cost on the diagonal + if (global_i_idx == global_j_idx) { + temp_ij += cost; + } + // update the upper triangular kernel matrix + kernel_matrix_ptr[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } - kernel_matrix_ptr[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; } } } - } - }); -} + }); + } +}; } // namespace plssvm::stdpar::detail diff --git a/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index fdebd9cb5..8aaa10792 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -18,8 +18,8 @@ #include "plssvm/constants.hpp" // plssvm::real_type #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type -#include "plssvm/kernel_functions.hpp" // plssvm::kernel_function -#include "plssvm/matrix.hpp" // aos_matrix +#include "plssvm/matrix.hpp" // plssvm::soa_matrix +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array @@ -32,100 +32,152 @@ namespace plssvm::stdpar::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function - * @param[in] alpha the scalar alpha value - * @param[in] q the `q` vector - * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for - * @param[in] QA_cost he bottom right matrix entry multiplied by cost - * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] B the matrix @p B - * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam target the target platform + * @tparam kernel_function the type of the used kernel function + * @tparam Args the types of the parameters necessary for the specific kernel function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { - PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); - PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); - PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); - PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); - - // calculate constants - const std::size_t num_rows = data.num_rows() - 1; - const std::size_t num_features = data.num_cols(); - const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated - std::iota(indices.begin(), indices.end(), 0); - - std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - - // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_row], data_ptr[dim * (num_rows + 1 + PADDING_SIZE_uz) + global_col]); +template +struct device_kernel_assembly_symm { + /** + * @brief Perform an implicit BLAS SYMM-like operation. 
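The implicit variant below never materialises the kernel matrix: each recomputed entry a_ij immediately updates row i of C and, by symmetry, row j, which requires atomic accumulation because concurrent work items may write to the same C row. A minimal sketch of that update pattern, using C++20 std::atomic_ref as a stand-in for the backend's own atomic_ref helper (function and layouts hypothetical):

#include <atomic>
#include <cstddef>
#include <vector>

// C and B are row-major [num_rows][num_classes]; a_ij is the recomputed kernel matrix entry
void symmetric_update(std::vector<double> &C, const std::vector<double> &B, const std::size_t num_classes,
                      const std::size_t i, const std::size_t j, const double alpha, const double a_ij) {
    for (std::size_t c = 0; c < num_classes; ++c) {
        // contribution of a_ij to row i of C
        std::atomic_ref{ C[i * num_classes + c] } += alpha * a_ij * B[j * num_classes + c];
        // and, by symmetry, to row j; the diagonal entry must only be applied once
        if (i != j) {
            std::atomic_ref{ C[j * num_classes + c] } += alpha * a_ij * B[i * num_classes + c];
        }
    }
}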
+ * @param[in] alpha the scalar alpha value + * @param[in] q the `q` vector + * @param[in] data the data matrix + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for + * @param[in] QA_cost he bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] B the matrix @p B + * @param[in,out] C the matrix @p C + * @param[in] kernel_function_parameter the potential additional arguments for the kernel function + */ + void operator()(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { + PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); + PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); + PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); + PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); + + // calculate constants + const std::size_t num_rows = data.num_rows() - 1; + const std::size_t num_features = data.num_cols(); + const std::size_t num_classes = B.num_rows(); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); + std::iota(indices.begin(), indices.end(), 0); + + std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), [=, q_ptr = q.data(), data_ptr = data.data(), B_ptr = B.data(), C_ptr = C.data()](const std::size_t idx) { + // calculate the indices used in the current thread + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + + // only calculate the upper triangular matrix + if (i_idx >= j_idx) { + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction 
calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_ptr[(feature_block + feature) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } + } } } - } - // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q_ptr[global_row] - q_ptr[global_col]; - // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx] } += alpha * temp_ij * B_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx]; + // apply the remaining part of the kernel function and store the value in the output kernel matrix + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q_ptr[global_i_idx] - q_ptr[global_j_idx]; + // apply the cost on the diagonal + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx] } += alpha * temp_ij * B_ptr[global_col * (num_classes + PADDING_SIZE_uz) + class_idx]; - // symmetry - atomic_ref{ C_ptr[global_col * (num_classes + PADDING_SIZE_uz) + class_idx] } += alpha * temp_ij * B_ptr[global_row * (num_classes + PADDING_SIZE_uz) + class_idx]; + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx] } += alpha * temp[internal_i][internal_j] * B_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx]; + } + } else { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + atomic_ref{ C_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx] } += alpha * 
temp[internal_i][internal_j] * B_ptr[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx]; + // symmetry + atomic_ref{ C_ptr[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx] } += alpha * temp[internal_i][internal_j] * B_ptr[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + class_idx]; + } } } } } } - } - }); -} + }); + } +}; } // namespace plssvm::stdpar::detail diff --git a/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp b/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp index e652d1160..b77e7a338 100644 --- a/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/stdpar/kernel/kernel_functions.hpp @@ -38,42 +38,17 @@ namespace plssvm::stdpar::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline PLSSVM_STDPAR_KERNEL_FUNCTION real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp b/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp index ce46e6a1c..4b487dce2 100644 --- a/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/stdpar/kernel/predict_kernel.hpp @@ -15,15 +15,16 @@ #include "plssvm/backends/stdpar/detail/utility.hpp" // plssvm::stdpar::detail::atomic_ref #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix #include "plssvm/shape.hpp" // plssvm::shape +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::for_each #include // std::array -#include // std::fma +#include // std::ceil #include // std::size_t #include // std::execution::par_unseq #include // std::iota @@ -33,230 +34,305 @@ namespace plssvm::stdpar::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
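For context on what device_kernel_w_linear computes: with the linear kernel, prediction can be sped up by precomputing one weight vector per class, w_c = sum over all support vectors of alpha[c][sv] * x_sv, so that predicting a point reduces to a single dot product per class instead of a sum over all support vectors. A plain, unblocked scalar sketch of that accumulation (hypothetical function, simple row-major layouts, no SoA storage or padding):

#include <cstddef>
#include <vector>

// alpha is [num_classes][num_sv], support_vectors is [num_sv][num_features],
// the returned w is [num_classes][num_features]; all matrices are plain row-major here
std::vector<double> compute_w_linear(const std::vector<double> &alpha, const std::vector<double> &support_vectors,
                                     const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_features) {
    std::vector<double> w(num_classes * num_features, 0.0);
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t sv = 0; sv < num_sv; ++sv) {
            for (std::size_t f = 0; f < num_features; ++f) {
                // w_c += alpha_{c,sv} * x_sv
                w[c * num_features + f] += alpha[c * num_sv + sv] * support_vectors[sv * num_features + f];
            }
        }
    }
    return w;
}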
- * @param[out] w the vector to speedup the linear prediction - * @param[in] alpha the previously learned weights - * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @tparam target the target platform */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { - PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); - PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); - - // calculate constants - const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = support_vectors.num_cols(); - const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_support_vectors = support_vectors.num_rows(); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_num_classes * blocked_num_features); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, w_ptr = w.data(), alpha_ptr = alpha.data(), sv_ptr = support_vectors.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_feature][internal_class] += alpha_ptr[global_class_idx * (num_support_vectors + PADDING_SIZE_uz) + sv_offset + sv] * sv_ptr[global_feature_idx * (num_support_vectors + PADDING_SIZE_uz) + sv_offset + sv]; +template +struct device_kernel_w_linear { + /** + * @brief Calculate the `w` vector used to speedup the prediction using the 
linear kernel function.
+     * @param[out] w the vector to speed up the linear prediction
+     * @param[in] alpha the previously learned weights
+     * @param[in] support_vectors the support vectors
+     * @param[in] device_num_sv the number of support vectors the current device is responsible for
+     * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for
+     */
+    void operator()(soa_matrix<real_type> &w, const aos_matrix<real_type> &alpha, const soa_matrix<real_type> &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) {
+        PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows());
+        PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }));
+        PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place-specific sv ({}) cannot be greater than the total number of sv ({})!", device_num_sv, support_vectors.num_rows());
+        PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater than the total number of sv ({})!", device_sv_offset, support_vectors.num_rows());
+
+        // calculate constants
+        const std::size_t num_classes = alpha.num_rows();
+        const std::size_t num_features = support_vectors.num_cols();
+        const std::size_t num_sv = support_vectors.num_rows();
+        const auto blocked_num_classes = static_cast<std::size_t>(std::ceil(static_cast<double>(num_classes) / INTERNAL_BLOCK_SIZE));
+        const auto blocked_num_features = static_cast<std::size_t>(std::ceil(static_cast<double>(num_features) / INTERNAL_BLOCK_SIZE));
+
+        // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
+        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
+
+        // define the range over which should be iterated
+        std::vector<std::size_t> range(blocked_num_classes * blocked_num_features);
+        std::iota(range.begin(), range.end(), 0);
+
+        std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, w_ptr = w.data(), alpha_ptr = alpha.data(), support_vectors_ptr = support_vectors.data()](const std::size_t idx) {
+            // calculate the indices used in the current thread
+            const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+            const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+
+            // create a thread private array used for internal caching
+            std::array<std::array<real_type, INTERNAL_BLOCK_SIZE>, INTERNAL_BLOCK_SIZE> temp{};
+
+            // iterate over all support vectors using blocking
+            for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) {
+                if constexpr (target == target_platform::cpu) {
+                    // perform the dot product calculation, the sv is the fastest moving index
+                    for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) {
+                        for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                            // calculate the indices to access the global data
+                            const auto global_feature_idx = feature_idx + static_cast<std::size_t>(internal_feature);
+                            const auto global_class_idx = class_idx + static_cast<std::size_t>(internal_class);
+
+                            real_type sum{ 0.0 };
+                            for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) {
+                                sum += alpha_ptr[global_class_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv] * // 
AoS + support_vectors_ptr[global_feature_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_feature][internal_class] += alpha_ptr[global_class_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv] * // AoS + support_vectors_ptr[global_feature_idx * (num_sv + PADDING_SIZE_uz) + device_sv_offset + sv_block + sv]; // SoA + } + } + } } } - } - // update global array with local one - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_ptr[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_ptr[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + } } - } - }); -} + }); + } +}; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction the predicted values - * @param[in] w the vector to speedup the calculations - * @param[in] rho the previously learned bias - * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
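For reference, the blocked functor above computes, per device, w(class, feature) as the sum over the device-local support vectors of alpha(class, sv) * support_vector(sv, feature); the CPU branch keeps the support-vector index innermost so the loads stay contiguous for the auto-vectorizer, while the non-CPU branch makes it the slowest moving index. The following minimal, unblocked sketch performs the same reduction on plain buffers; the function name and the buffer layouts are illustrative assumptions, not the PLSSVM API:

#include <cstddef>
#include <vector>

// Unblocked reference: w(c, d) = sum over all support vectors sv of alpha(c, sv) * sv_data(sv, d).
// alpha is laid out class-major and sv_data feature-major (both with the support vector as the
// fastest moving index), loosely mirroring the AoS/SoA layouts used by the kernel above,
// but without padding, blocking, or parallelization.
std::vector<double> reference_w(const std::vector<double> &alpha,    // num_classes * num_sv
                                const std::vector<double> &sv_data,  // num_features * num_sv
                                const std::size_t num_classes,
                                const std::size_t num_features,
                                const std::size_t num_sv) {
    std::vector<double> w(num_classes * num_features, 0.0);
    for (std::size_t c = 0; c < num_classes; ++c) {
        for (std::size_t d = 0; d < num_features; ++d) {
            double sum{ 0.0 };
            for (std::size_t sv = 0; sv < num_sv; ++sv) {
                sum += alpha[c * num_sv + sv] * sv_data[d * num_sv + sv];
            }
            w[c * num_features + d] = sum;
        }
    }
    return w;
}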
+ * @tparam target the target platform */ -inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { - PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); - PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); - PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); - - // calculate constants - const std::size_t num_predict_points = predict_points.num_rows(); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), w_ptr = w.data(), rho_ptr = rho.data(), pp_ptr = predict_points.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pp][internal_class] += w_ptr[dim * (num_classes + PADDING_SIZE_uz) + global_class_idx] * pp_ptr[dim * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]; +template +struct device_kernel_predict_linear { + /** + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
+     * @param[out] prediction the predicted values
+     * @param[in] w the vector to speed up the calculations
+     * @param[in] rho the previously learned bias
+     * @param[in] predict_points the data points to predict
+     * @param[in] device_num_predict_points the number of predict points the current device is responsible for
+     * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for
+     */
+    void operator()(aos_matrix<real_type> &prediction, const soa_matrix<real_type> &w, const std::vector<real_type> &rho, const soa_matrix<real_type> &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) {
+        PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size());
+        PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols());
+        PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() }));
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place-specific predict points ({}) cannot be greater than the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows());
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater than the total number of predict points ({})!", device_row_offset, predict_points.num_rows());
+
+        // calculate constants
+        const std::size_t num_predict_points = predict_points.num_rows();
+        const std::size_t num_classes = prediction.num_cols();
+        const std::size_t num_features = predict_points.num_cols();
+        const auto blocked_device_num_predict_points = static_cast<std::size_t>(std::ceil(static_cast<double>(device_num_predict_points) / INTERNAL_BLOCK_SIZE));
+        const auto blocked_num_classes = static_cast<std::size_t>(std::ceil(static_cast<double>(num_classes) / INTERNAL_BLOCK_SIZE));
+
+        // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
+        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
+
+        // define the range over which should be iterated
+        std::vector<std::size_t> range(blocked_device_num_predict_points * blocked_num_classes);
+        std::iota(range.begin(), range.end(), 0);
+
+        std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), w_ptr = w.data(), rho_ptr = rho.data(), predict_points_ptr = predict_points.data()](const std::size_t idx) {
+            // calculate the indices used in the current thread
+            const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+            const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz;
+
+            // create a thread private array used for internal caching
+            std::array<std::array<real_type, INTERNAL_BLOCK_SIZE>, INTERNAL_BLOCK_SIZE> temp{};
+
+            // iterate over all features using blocking
+            for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) {
+                if constexpr (target == target_platform::cpu) {
+                    // perform the dot product calculation, the feature is the fastest moving index
+                    for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) {
+                        for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) {
+                            // calculate the indices to access the global data
+                            const auto global_pp_idx = 
device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_ptr[(feature_block + feature) * (num_classes + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_ptr[(feature_block + feature) * (num_classes + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } + } } } - } - // perform the dot product calculation - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_ptr[global_class_idx]; } } - } - }); -} + }); + } +}; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. - * @tparam kernel the type of the used kernel function + * @brief Predict the @p predict_points using the @p kernel_function. 
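The functor above is the second half of the linear-kernel shortcut: once w is available, each prediction reduces to prediction(point, class) = sum over all features d of predict_point(point, d) * w(class, d), minus the bias rho(class). A matching unblocked reference follows, again with illustrative buffer layouts rather than the actual padded SoA/AoS matrices:

#include <cstddef>
#include <vector>

// Unblocked reference for the linear prediction with a precomputed w:
// prediction(i, c) = sum over all features d of points(i, d) * w(c, d) - rho(c).
std::vector<double> reference_predict_linear(const std::vector<double> &w,       // num_classes * num_features
                                             const std::vector<double> &rho,     // num_classes
                                             const std::vector<double> &points,  // num_points * num_features
                                             const std::size_t num_classes,
                                             const std::size_t num_features,
                                             const std::size_t num_points) {
    std::vector<double> prediction(num_points * num_classes, 0.0);
    for (std::size_t i = 0; i < num_points; ++i) {
        for (std::size_t c = 0; c < num_classes; ++c) {
            double sum{ 0.0 };
            for (std::size_t d = 0; d < num_features; ++d) {
                sum += points[i * num_features + d] * w[c * num_features + d];
            }
            prediction[i * num_classes + c] = sum - rho[c];
        }
    }
    return prediction;
}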
+ * @tparam target the target platform + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] prediction the predicted values - * @param[in] alpha the previously learned weights - * @param[in] rho the previously learned bias - * @param[in] support_vectors the support vectors - * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for - * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { - PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); - PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); - PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); - - // calculate constants - const std::size_t num_classes = alpha.num_rows(); - const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_predict_points = predict_points.num_rows(); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); - - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); - std::iota(range.begin(), range.end(), 0); - - std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), alpha_ptr = alpha.data(), rho_ptr = rho.data(), sv_ptr = support_vectors.data(), pp_ptr = predict_points.data()](const std::size_t idx) { - // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp 
* INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; - - // create a thread private array used for internal caching - std::array, INTERNAL_BLOCK_SIZE> temp{}; - - // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pp][internal_sv] += detail::feature_reduce(sv_ptr[dim * (num_support_vectors + PADDING_SIZE_uz) + global_sv_idx], - pp_ptr[dim * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]); +template +struct device_kernel_predict { + /** + * @brief Predict the @p predict_points using the kernel function. + * @param[out] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned bias + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for + * @param[in] kernel_function_parameter the parameters necessary to apply the kernel function + */ + void operator()(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) {
+        PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size());
+        PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows());
+        PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols());
+        PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }));
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place-specific predict points ({}) cannot be greater than the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows());
+        PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater than the total number of predict points ({})!", device_row_offset, predict_points.num_rows());
+
+        // calculate constants
+        const std::size_t num_classes = alpha.num_rows();
+        const std::size_t device_num_sv = support_vectors.num_rows();
+        const std::size_t num_features = predict_points.num_cols();
+        const std::size_t num_predict_points = predict_points.num_rows();
+        const auto blocked_device_num_sv = static_cast<std::size_t>(std::ceil(static_cast<double>(device_num_sv) / INTERNAL_BLOCK_SIZE));
+        const auto blocked_device_num_predict_points = static_cast<std::size_t>(std::ceil(static_cast<double>(device_num_predict_points) / INTERNAL_BLOCK_SIZE));
+
+        // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
+        constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast<std::size_t>(INTERNAL_BLOCK_SIZE);
+        constexpr auto THREAD_BLOCK_SIZE_uz = static_cast<std::size_t>(THREAD_BLOCK_SIZE);
+        constexpr auto PADDING_SIZE_uz = static_cast<std::size_t>(PADDING_SIZE);
+
+        // define the range over which should be iterated
+        std::vector<std::size_t> range(blocked_device_num_predict_points * blocked_device_num_sv);
+        std::iota(range.begin(), range.end(), 0);
+
+        std::for_each(std::execution::par_unseq, range.begin(), range.end(), [=, prediction_ptr = prediction.data(), alpha_ptr = alpha.data(), rho_ptr = rho.data(), support_vectors_ptr = support_vectors.data(), predict_points_ptr = predict_points.data()](const std::size_t idx) {
+            // calculate the indices used in the current thread
+            const std::size_t pp_idx = (idx / blocked_device_num_sv) * INTERNAL_BLOCK_SIZE_uz;  // num_predict_points
+            const std::size_t sv_idx = (idx % blocked_device_num_sv) * INTERNAL_BLOCK_SIZE_uz;  // num_support_vectors
+
+            // create a thread private array used for internal caching
+            std::array<std::array<real_type, INTERNAL_BLOCK_SIZE>, INTERNAL_BLOCK_SIZE> temp{};
+
+            // iterate over all features
+            for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) {
+                if constexpr (target == target_platform::cpu) {
+                    // perform the feature reduction calculation, the feature is the fastest moving index
+                    for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) {
+                        for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) {
+                            // calculate the indices to access the global data
+                            const auto global_pp_idx = device_row_offset + pp_idx + static_cast<std::size_t>(internal_pp);
+                            const auto global_sv_idx = sv_idx + static_cast<std::size_t>(internal_sv);
+
+                            real_type sum{ 0.0 };
+                            for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) {
+                                sum += 
detail::feature_reduce(support_vectors_ptr[(feature_block + feature) * (device_num_sv + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_ptr[(feature_block + feature) * (device_num_sv + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_ptr[(feature_block + feature) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } + } } } - } - // update temp using the respective kernel function - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + // update temp using the respective kernel function + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + } } - } - // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + // atomically add the results to the prediction for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + a] } += -rho_ptr[a]; + atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_idx] } += -rho_ptr[class_idx]; } - atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + a] } += - temp[internal_pp][internal_sv] * alpha_ptr[a * (num_support_vectors + PADDING_SIZE_uz) + global_sv_idx]; + atomic_ref{ prediction_ptr[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_idx] } += + temp[internal_pp][internal_sv] * alpha_ptr[class_idx * (device_num_sv + PADDING_SIZE_uz) + global_sv_idx]; } } } - } - }); -} + }); + } +}; } // 
namespace plssvm::stdpar::detail diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index e99dbeddd..81d992991 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -38,11 +38,8 @@ constexpr unsigned INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; constexpr unsigned INTERNAL_BLOCK_SIZE = 4; #endif -/// Global compile time constant used for internal feature caching. -constexpr unsigned FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE; - -/// Padding used for the device w_d matrix to prevent out-of-bounce accesses without ifs. -constexpr unsigned PADDING_SIZE = FEATURE_BLOCK_SIZE > (THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) ? FEATURE_BLOCK_SIZE : (THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE); +/// Padding used for the device arrays and matrices to prevent out-of-bounce accesses without ifs. +constexpr unsigned PADDING_SIZE = THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE; // perform sanity checks static_assert(detail::tuple_contains_v, "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index 6ec7773c4..726165679 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -13,35 +13,35 @@ #define PLSSVM_CORE_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // all supported backend types -#include "plssvm/backends/SYCL/implementation_types.hpp" // the SYCL implementation type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // the SYCL specific kernel invocation typ -#include "plssvm/classification_report.hpp" // reports different metrics (precision, recall, f1 score, and support) for the different classes after scoring -#include "plssvm/classification_types.hpp" // all supported multi-class classification strategies -#include "plssvm/constants.hpp" // verbosity flag und compile-time constants -#include "plssvm/csvm_factory.hpp" // a factory function to instantiate a C-SVM using a runtime backend; includes the available backend C-SVMs -#include "plssvm/data_set/classification_data_set.hpp" // a classification data set used for training a C-SVC -#include "plssvm/data_set/min_max_scaler.hpp" // a min-max scaler for the data sets -#include "plssvm/data_set/regression_data_set.hpp" // a regression data set used for training a C-SVR -#include "plssvm/environment.hpp" // environment management functions and classes -#include "plssvm/exceptions/exceptions.hpp" // exception hierarchy -#include "plssvm/file_format_types.hpp" // all supported file format types -#include "plssvm/gamma.hpp" // the types of the gamma parameter -#include "plssvm/kernel_function_types.hpp" // all supported kernel function types -#include "plssvm/kernel_functions.hpp" // implementation of all supported kernel functions -#include "plssvm/matrix.hpp" // a custom matrix class -#include "plssvm/model/classification_model.hpp" // the model as a result of training a C-SVC -#include "plssvm/model/regression_model.hpp" // the model as a result of training a C-SVR -#include "plssvm/mpi/communicator.hpp" // PLSSVM MPI communicator wrapper -#include "plssvm/parameter.hpp" // the C-SVM parameter -#include "plssvm/regression_report.hpp" // reports different metrics (e.g., mean squared error or R^2 score) for the regression task after scoring -#include "plssvm/shape.hpp" // shape for a matrix or device pointer -#include "plssvm/solver_types.hpp" // all supported solver types (e.g., Conjugate Gradients with explicit, streaming, or implicit kernel matrix 
generation) -#include "plssvm/svm/csvc.hpp" // the base C-SVC every backend is inheriting from -#include "plssvm/svm/csvr.hpp" // the base C-SVR every backend is inheriting from -#include "plssvm/target_platforms.hpp" // all supported target platforms -#include "plssvm/verbosity_levels.hpp" // all supported verbosity levels -#include "plssvm/version/version.hpp" // version information +#include "plssvm/backend_types.hpp" // all supported backend types +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // the SYCL specific data parallel kernels +#include "plssvm/backends/SYCL/implementation_types.hpp" // the SYCL implementation type +#include "plssvm/classification_report.hpp" // reports different metrics (precision, recall, f1 score, and support) for the different classes after scoring +#include "plssvm/classification_types.hpp" // all supported multi-class classification strategies +#include "plssvm/constants.hpp" // verbosity flag und compile-time constants +#include "plssvm/csvm_factory.hpp" // a factory function to instantiate a C-SVM using a runtime backend; includes the available backend C-SVMs +#include "plssvm/data_set/classification_data_set.hpp" // a classification data set used for training a C-SVC +#include "plssvm/data_set/min_max_scaler.hpp" // a min-max scaler for the data sets +#include "plssvm/data_set/regression_data_set.hpp" // a regression data set used for training a C-SVR +#include "plssvm/environment.hpp" // environment management functions and classes +#include "plssvm/exceptions/exceptions.hpp" // exception hierarchy +#include "plssvm/file_format_types.hpp" // all supported file format types +#include "plssvm/gamma.hpp" // the types of the gamma parameter +#include "plssvm/kernel_function_types.hpp" // all supported kernel function types +#include "plssvm/kernel_functions.hpp" // implementation of all supported kernel functions +#include "plssvm/matrix.hpp" // a custom matrix class +#include "plssvm/model/classification_model.hpp" // the model as a result of training a C-SVC +#include "plssvm/model/regression_model.hpp" // the model as a result of training a C-SVR +#include "plssvm/mpi/communicator.hpp" // PLSSVM MPI communicator wrapper +#include "plssvm/parameter.hpp" // the C-SVM parameter +#include "plssvm/regression_report.hpp" // reports different metrics (e.g., mean squared error or R^2 score) for the regression task after scoring +#include "plssvm/shape.hpp" // shape for a matrix or device pointer +#include "plssvm/solver_types.hpp" // all supported solver types (e.g., Conjugate Gradients with explicit, streaming, or implicit kernel matrix generation) +#include "plssvm/svm/csvc.hpp" // the base C-SVC every backend is inheriting from +#include "plssvm/svm/csvr.hpp" // the base C-SVR every backend is inheriting from +#include "plssvm/target_platforms.hpp" // all supported target platforms +#include "plssvm/verbosity_levels.hpp" // all supported verbosity levels +#include "plssvm/version/version.hpp" // version information /// The main namespace containing all public API functions. 
namespace plssvm { } diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp new file mode 100644 index 000000000..e69de29bb diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp index 073e92f6c..8bfa7ef3e 100644 --- a/include/plssvm/detail/cmd/parser_predict.hpp +++ b/include/plssvm/detail/cmd/parser_predict.hpp @@ -13,12 +13,12 @@ #define PLSSVM_DETAIL_CMD_PARSER_PREDICT_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // fmt::ostream_formatter @@ -47,8 +47,8 @@ struct parser_predict { /// The target platform: automatic (depending on the used backend), CPUs or GPUs from NVIDIA, AMD, or Intel. target_platform target{ target_platform::automatic }; - /// The kernel invocation type when using SYCL as backend. - sycl::kernel_invocation_type sycl_kernel_invocation_type{ sycl::kernel_invocation_type::automatic }; + /// The data parallel kernel when using SYCL as backend. + sycl::data_parallel_kernel sycl_data_parallel_kernel{ sycl::data_parallel_kernel::automatic }; /// The SYCL implementation to use with `--backend sycl`. 
sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; diff --git a/include/plssvm/detail/cmd/parser_train.hpp b/include/plssvm/detail/cmd/parser_train.hpp index 6ddae10ac..6253394a6 100644 --- a/include/plssvm/detail/cmd/parser_train.hpp +++ b/include/plssvm/detail/cmd/parser_train.hpp @@ -13,17 +13,17 @@ #define PLSSVM_DETAIL_CMD_PARSER_TRAIN_HPP_ #pragma once -#include "plssvm/backend_types.hpp" // plssvm::backend_type -#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space -#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type -#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type -#include "plssvm/classification_types.hpp" // plssvm::classification_type -#include "plssvm/constants.hpp" // plssvm::real_type -#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/solver_types.hpp" // plssvm::solving_type -#include "plssvm/svm_types.hpp" // plssvm::svm_type -#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernel +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/classification_types.hpp" // plssvm::classification_type +#include "plssvm/constants.hpp" // plssvm::real_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/solver_types.hpp" // plssvm::solving_type +#include "plssvm/svm_types.hpp" // plssvm::svm_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "fmt/base.h" // fmt::formatter #include "fmt/ostream.h" // mt::ostream_formatter @@ -67,8 +67,8 @@ struct parser_train { /// The used solver type for the LS-SVM kernel matrix: automatic (depending on the available (V)RAM), cg_explicit, or cg_implicit. solver_type solver{ solver_type::automatic }; - /// The kernel invocation type when using SYCL as backend. - sycl::kernel_invocation_type sycl_kernel_invocation_type{ sycl::kernel_invocation_type::automatic }; + /// The data parallel kernel when using SYCL as backend. + sycl::data_parallel_kernel sycl_data_parallel_kernel{ sycl::data_parallel_kernel::automatic }; /// The SYCL implementation to use with --backend=sycl. sycl::implementation_type sycl_implementation_type{ sycl::implementation_type::automatic }; diff --git a/include/plssvm/detail/cmd/utility.hpp b/include/plssvm/detail/cmd/utility.hpp new file mode 100644 index 000000000..9cc689868 --- /dev/null +++ b/include/plssvm/detail/cmd/utility.hpp @@ -0,0 +1,94 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Utility functions related to the command line parser functionality. 
+ */ + +#ifndef PLSSVM_DETAIL_CMD_UTILITY_HPP_ +#define PLSSVM_DETAIL_CMD_UTILITY_HPP_ +#pragma once + +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/Kokkos/execution_space.hpp" // plssvm::kokkos::execution_space +#include "plssvm/backends/SYCL/data_parallel_kernels.hpp" // plssvm::sycl::data_parallel_kernels +#include "plssvm/backends/SYCL/implementation_types.hpp" // plssvm::sycl::implementation_type +#include "plssvm/mpi/communicator.hpp" // plssvm::mpi::communicator +#include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "plssvm/verbosity_levels.hpp" // plssvm::verbosity_level + +#include "cxxopts.hpp" // cxxopts::ParseResult, cxxopts::Options + +#include // std::size_t +#include // std::optional +#include // std::string +#include // std::pair +#include // std::vector + +namespace plssvm::detail::cmd { + +/** + * @brief Filter the provided command line options starting with the @p prefix_filter. + * @details Currently, per default filters out all options starting with "--hpx:" and "--kokkos-". + * @attention **ONLY** single command line options are supported! I.e., "--hpx:threads=42" is supported, but not "--hpx:threads 42". + * @param[in] argc the number of provided command line options to be filtered + * @param[in] argv the command line options to be filtered + * @param[in] prefix_filter a list of prefixes that should be filtered + * @return a `std::vector` containing all non-filtered command line options (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector filter_argv(int argc, char **argv, const std::vector &prefix_filter = { "--hpx:", "--kokkos-" }); + +/** + * @brief Assemble a more detailed help message for the kernel function types also containing their mathematical formula. + * @return the kernel functions' help message (`[[nodiscard]]`) + */ +[[nodiscard]] std::string kernel_type_help_message(); + +/** + * @brief If a SYCL backend is available, parse the SYCL specific command line options "--sycl_data_parallel_kernel" and "--sycl_implementation_type". + * @details If a SYCL backend is available, returns the two parsed command line options wrapped in a `std::pair`, otherwise returns a `std::nullopt`. + * @param[in] result the cxxopts parser result encapsulating the command line options + * @param[in] comm the MPI communicator + * @param[in] backend the requested backend + * @param[in] target the requested target platform + * @return the parsed, SYCL specific command line options (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional> parse_and_check_sycl_options_if_available(const cxxopts::ParseResult &result, const mpi::communicator &comm, backend_type backend, target_platform target); + +/** + * @brief If the Kokkos backend is available, parse the Kokkos specific command line option "--kokkos_execution_space". + * @details If the Kokkos backend is available, returns the parsed command line option, otherwise returns a `std::nullopt`. 
+ * @param[in] result the cxxopts parser result encapsulating the command line option + * @param[in] comm the MPI communicator + * @param[in] backend the requested backend + * @param[in] target the requested target platform + * @return the parsed, Kokkos specific command line option (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional parse_and_check_kokkos_options_if_available(const cxxopts::ParseResult &result, const mpi::communicator &comm, backend_type backend, target_platform target); + +/** + * @brief If MPI is available, parse the MPI specific command line option "--mpi_load_balancing_weights". + * @details If MPI is available, returns the parsed command line option, otherwise returns a `std::nullopt`. + * @param[in] result the cxxopts parser result encapsulating the command line option + * @param[in] options all supported command line options + * @param[in] comm the MPI communicator + * @return the parsed, MPI specific command line option (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional> parse_and_check_mpi_options_if_available(const cxxopts::ParseResult &result, const cxxopts::Options &options, const mpi::communicator &comm); + +/** + * @brief Parse the verbosity command line option. + * @details If it was provided, returns the parsed value, otherwise returns a `std::nullopt`. + * @param[in] result the cxxopts parser result encapsulating the command line option + * @param[in] comm the MPI communicator + * @return the parsed verbosity command line option (`[[nodiscard]]`) + */ +[[nodiscard]] std::optional parse_verbosity(const cxxopts::ParseResult &result, const mpi::communicator &comm); + +} // namespace plssvm::detail::cmd + +#endif // PLSSVM_DETAIL_CMD_UTILITY_HPP_ diff --git a/include/plssvm/detail/data_distribution.hpp b/include/plssvm/detail/data_distribution.hpp index af4043a79..fd433bcd0 100644 --- a/include/plssvm/detail/data_distribution.hpp +++ b/include/plssvm/detail/data_distribution.hpp @@ -23,6 +23,7 @@ #include // std::size_t #include // std::ostream forward declaration #include // std::accumulate +#include // std::pair #include // std::vector namespace plssvm::detail { @@ -234,6 +235,22 @@ class triangular_data_distribution : public data_distribution { */ [[nodiscard]] std::vector calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** + * @brief Calculate the theoretical total memory needed per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical total memory needed per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::pair> calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(std::size_t num_features, std::size_t num_classes) const; + + /** + * @brief Calculate the theoretical maximum single memory allocation size per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical maximum single memory allocation size per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::vector calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** * @brief Calculate the theoretical total memory needed per place for implicitly assembling the kernel matrix. 
* @param[in] num_features the total number of features diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp new file mode 100644 index 000000000..ca58eec3a --- /dev/null +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -0,0 +1,125 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + */ + +#ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ + +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT + +#include // std::size_t +#include // std::memset +#include // std::unique_ptr +#include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v + +namespace plssvm::detail { + +/** + * @brief Helper struct to check whether @p T is an unbounded array. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref plssvm::detail::is_unbounded_array for unbounded arrays. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref plssvm::detail::is_unbounded_array. + * @tparam T the array type + */ +template +constexpr bool is_unbounded_array_v = is_unbounded_array::value; + +/** + * @brief Helper struct to check whether @p T is a bounded array. + * @tparam T the array type + */ +template +struct is_bounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref plssvm::detail::is_bounded_array for unbounded arrays. + * @tparam T the array type + * @tparam N the size of the array + */ +template +struct is_bounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref plssvm::detail::is_bounded_array. + * @tparam T the array type + */ +template +constexpr bool is_bounded_array_v = is_bounded_array::value; + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +[[nodiscard]] std::unique_ptr make_unique_for_overwrite() { + return std::unique_ptr(new T); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the objects to create + * @param[in] n the size of the array to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +std::unique_ptr make_unique_for_overwrite(const std::size_t n) { + return std::unique_ptr(new std::remove_extent_t[n]); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. 
+ * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @tparam Args the types of the constructor arguments + * @param[in] args the arguments to pass to the constructor + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +auto make_unique_for_overwrite(Args &&...args) = delete; + +/** + * @brief Fill the array @p dest with zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset. + * @tparam T the type of the values + * @param[in,out] dest the array to fill with zeros + * @param[in] count the number of values to fill + */ +template +void parallel_zero_memset(T *dest, const std::size_t count) { + PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); + +// initialize the data pointed to by dest to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < count; ++i) { + dest[i] = T{ 0 }; + } +#else + std::memset(dest, 0, count * sizeof(T)); +#endif +} + +} // namespace plssvm::detail + +#endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/include/plssvm/detail/type_traits.hpp b/include/plssvm/detail/type_traits.hpp index 288f8b80b..e29452955 100644 --- a/include/plssvm/detail/type_traits.hpp +++ b/include/plssvm/detail/type_traits.hpp @@ -24,6 +24,7 @@ #include // std::enable_if_t, std::remove_cv_t, std::remove_reference_t, std::is_same_v, std::false_type, std::true_type, std::is_same_v #include // std::unordered_map, std::unordered_multimap #include // std::unordered_set, std::unordered_multiset +#include // std::variant #include // std::vector namespace plssvm::detail { @@ -357,6 +358,25 @@ struct is_one_type_of { template constexpr bool is_one_type_of_v = is_one_type_of::value; +/** + * @brief Type trait to check whether @p T is a `std::variant`. + * @tparam T the type to check + */ +template +struct is_variant : std::false_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +struct is_variant> : std::true_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +constexpr bool is_variant_v = is_variant::value; + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/detail/utility.hpp b/include/plssvm/detail/utility.hpp index e81d46ae1..613a571cc 100644 --- a/include/plssvm/detail/utility.hpp +++ b/include/plssvm/detail/utility.hpp @@ -50,6 +50,21 @@ namespace plssvm::detail { +/** + * @brief Shorthand for a more readable `std::visit` overload set. + * @tparam Ts the visited types + */ +template +struct visit_overload : Ts... { + using Ts::operator()...; +}; + +/** + * @brief plssvm::detail::visit_overload + */ +template +visit_overload(Ts...) -> visit_overload; + /** * @brief Invokes undefined behavior. Used to mark code paths that may never be reachable. 
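A typical use of the visit_overload helper introduced above is to dispatch on a std::variant with one lambda per alternative. The sketch below repeats the helper locally so it compiles on its own and uses an illustrative variant type, not a PLSSVM type:

#include <iostream>
#include <string>
#include <variant>

// Combine several lambdas into a single overload set and dispatch on the active
// alternative of a std::variant via std::visit (standard C++17 idiom).
template <typename... Ts>
struct visit_overload : Ts... {
    using Ts::operator()...;
};

template <typename... Ts>
visit_overload(Ts...) -> visit_overload<Ts...>;

int main() {
    const std::variant<int, double, std::string> value{ 3.14 };

    std::visit(visit_overload{
                   [](const int i) { std::cout << "int: " << i << '\n'; },
                   [](const double d) { std::cout << "double: " << d << '\n'; },
                   [](const std::string &s) { std::cout << "string: " << s << '\n'; } },
               value);
    return 0;
}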
* @details See: C++23 [`std::unreachable`](https://en.cppreference.com/w/cpp/utility/unreachable) diff --git a/include/plssvm/environment.hpp b/include/plssvm/environment.hpp index 3dec0a5c6..b2ac0bb35 100644 --- a/include/plssvm/environment.hpp +++ b/include/plssvm/environment.hpp @@ -18,13 +18,15 @@ #include "plssvm/backend_types.hpp" // plssvm::backend_type, plssvm::list_available_backends #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/cmd/utility.hpp" // plssvm::detail::cmd::filter_argv #include "plssvm/detail/string_utility.hpp" // plssvm::detail::to_lower_case #include "plssvm/detail/utility.hpp" // plssvm::detail::{contains, unreachable} #include "plssvm/exceptions/exceptions.hpp" // plssvm::environment_exception #include "plssvm/mpi/environment.hpp" // plssvm::mpi::{is_initialized, init} -#if defined(PLSSVM_HAS_HPX_BACKEND) +#if defined(PLSSVM_HAS_HPX_BACKEND) || defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HPX) #include "hpx/execution.hpp" // ::hpx::post + #include "hpx/hpx_main.hpp" // disable support for HPX's short command line aliases #include "hpx/hpx_start.hpp" // ::hpx::{start, stop, finalize} #include "hpx/runtime.hpp" // ::hpx::{is_running, is_stopped} #endif @@ -239,7 +241,13 @@ inline void initialize_backend([[maybe_unused]] const backend_type backend, [[ma #endif #if defined(PLSSVM_HAS_KOKKOS_BACKEND) if (backend == backend_type::kokkos) { - Kokkos::initialize(argc, argv); + #if defined(PLSSVM_KOKKOS_BACKEND_ENABLE_HPX) + ::hpx::start(nullptr, argc, argv); + #endif + // we have to filter out our "--kokkos_execution_space" command line option or Kokkos itself will issue a warning on the command line + std::vector filtered_argv = plssvm::detail::cmd::filter_argv(argc, argv, { "--kokkos_" }); + int filtered_argc = static_cast(filtered_argv.size()); + Kokkos::initialize(filtered_argc, filtered_argv.data()); } #endif } diff --git a/include/plssvm/mpi/communicator.hpp b/include/plssvm/mpi/communicator.hpp index 886d13427..d0af17b88 100644 --- a/include/plssvm/mpi/communicator.hpp +++ b/include/plssvm/mpi/communicator.hpp @@ -96,6 +96,12 @@ class communicator { */ [[nodiscard]] constexpr static bool is_mpi_enabled() { return PLSSVM_IS_DEFINED(PLSSVM_HAS_MPI_ENABLED); } + /** + * @brief Check whether more than one MPI process is running, i.e., MPI is used to speed-up the computations. + * @return `true` if more than one MPI process is running, otherwise `false` ([[nodiscard]]) + */ + [[nodiscard]] bool is_mpi_parallel() const { return this->size() > std::size_t{ 1 }; } + /** * @brief Returns `true` if the current MPI rank is rank `0`, i.e., the main MPI rank. * @details If `PLSSVM_HAS_MPI_ENABLED` is undefined, returns `true`. diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp index 378c3d3ea..42c9544ff 100644 --- a/include/plssvm/parameter.hpp +++ b/include/plssvm/parameter.hpp @@ -57,8 +57,8 @@ IGOR_MAKE_NAMED_ARGUMENT(solver); IGOR_MAKE_NAMED_ARGUMENT(classification); /// Create a named argument for the SYCL backend specific SYCL implementation type (DPC++ or AdaptiveCpp). IGOR_MAKE_NAMED_ARGUMENT(sycl_implementation_type); -/// Create a named argument for the SYCL backend specific kernel invocation type. -IGOR_MAKE_NAMED_ARGUMENT(sycl_kernel_invocation_type); +/// Create a named argument for the SYCL backend specific data parallel kernels. +IGOR_MAKE_NAMED_ARGUMENT(sycl_data_parallel_kernel); /// Create a named argument for the Kokkos backend specific execution space. 
IGOR_MAKE_NAMED_ARGUMENT(kokkos_execution_space); @@ -76,13 +76,13 @@ constexpr bool has_only_parameter_named_args_v = !igor::has_other_than( * @brief Trait to check whether @p Args only contains named-parameter that can be used to initialize a `plssvm::parameter` struct including SYCL specific named-parameters. */ template -constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type); +constexpr bool has_only_sycl_parameter_named_args_v = !igor::has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_data_parallel_kernel); /** * @brief Trait to check whether @p Args only contains SYCL specific named-parameters. */ template -constexpr bool has_only_sycl_named_args_v = !igor::has_other_than(plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type); +constexpr bool has_only_sycl_named_args_v = !igor::has_other_than(plssvm::sycl_implementation_type, plssvm::sycl_data_parallel_kernel); /** * @brief Trait to check whether @p Args only contains named-parameter that can be used to initialize a `plssvm::parameter` struct including Kokkos specific named-parameters. @@ -215,7 +215,7 @@ struct parameter { // compile time check: each named parameter must only be passed once static_assert(!parser.has_duplicates(), "Can only use each named parameter once!"); // compile time check: only some named parameters are allowed - static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_kernel_invocation_type, plssvm::kokkos_execution_space), + static_assert(!parser.has_other_than(plssvm::kernel_type, plssvm::gamma, plssvm::degree, plssvm::coef0, plssvm::cost, plssvm::sycl_implementation_type, plssvm::sycl_data_parallel_kernel, plssvm::kokkos_execution_space), "An illegal named parameter has been passed!"); // shorthand function for emitting a warning if a provided parameter is not used by the current kernel function diff --git a/include/plssvm/solver_types.hpp b/include/plssvm/solver_types.hpp index 3bcbe68f9..83f1e7139 100644 --- a/include/plssvm/solver_types.hpp +++ b/include/plssvm/solver_types.hpp @@ -32,6 +32,8 @@ enum class solver_type { automatic, /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the device. */ cg_explicit, + /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the host. Realized using unified shared memory. */ + cg_streaming, /** Use the CG algorithm implicitly recomputing the kernel matrix each CG iteration (smallest memory footprint). 
*/ cg_implicit }; diff --git a/include/plssvm/svm/csvm.hpp b/include/plssvm/svm/csvm.hpp index 1acd4738e..fc98a1cdc 100644 --- a/include/plssvm/svm/csvm.hpp +++ b/include/plssvm/svm/csvm.hpp @@ -357,6 +357,7 @@ std::tuple, std::vector, std::vectornum_available_devices() }; const std::vector total_memory_needed_explicit_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); + const std::pair> total_memory_needed_streaming_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(num_features, num_rhs); const std::vector total_memory_needed_implicit_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); // format a vector differentiating between it containing only a single entry or multiple @@ -368,7 +369,7 @@ std::tuple, std::vector, std::vector, std::vector, std::vector(percentual_safety_margin * 100.0L), minimal_safety_margin, detail::tracking::tracking_entry{ "solver", "system_memory", total_system_memory }, @@ -386,11 +388,15 @@ std::tuple, std::vector, std::vector, std::vector, std::vector, std::vector, std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + if (const std::vector failed_cg_streaming_constraints = check_sizes(total_memory_needed_streaming_per_device.second, usable_device_memory_per_device); + total_memory_needed_streaming_per_device.first <= usable_system_memory && failed_cg_streaming_constraints.empty()) { // use the implicit solver type - used_solver = solver_type::cg_implicit; + used_solver = solver_type::cg_streaming; } else { - // not enough device memory available for the implicit case - throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + if (!comm_.is_mpi_parallel()) { + // output only if a single MPI rank is used + if (!failed_cg_streaming_constraints.empty()) { + detail::log_untracked(verbosity_level::full, + comm_, + "Cannot use cg_streaming due to memory constraints on device(s) {}!\n", + format_vector(failed_cg_streaming_constraints)); + } + if (total_memory_needed_streaming_per_device.first > usable_system_memory) { + // output only if a single MPI rank is used + detail::log_untracked(verbosity_level::full, + comm_, + "Cannot use cg_streaming due to system memory constraints!\n"); + } + } + + // check whether there is enough memory available for cg_implicit + if (const std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + // use the implicit solver type + used_solver = solver_type::cg_implicit; + } else { + // not enough device memory available for the implicit case + throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + } } } @@ -436,22 +464,27 @@ std::tuple, std::vector, std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_streaming_size_per_device = 
data_distribution.calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); // output the maximum memory allocation size per device - if (comm_.size() <= 1) { + if (!comm_.is_mpi_parallel()) { // output only if a single MPI rank is used detail::log_untracked(verbosity_level::full, comm_, + "\n" " - maximum supported single memory allocation size: {}\n" " - maximum needed single memory allocation size (cg_explicit): {}\n" + " - maximum needed single memory allocation size (cg_streaming): {}\n" " - maximum needed single memory allocation size (cg_implicit): {}\n", format_vector(max_mem_alloc_size_per_device), format_vector(max_single_allocation_cg_explicit_size_per_device), + format_vector(max_single_allocation_cg_streaming_size_per_device), format_vector(max_single_allocation_cg_implicit_size_per_device)); } PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_streaming", max_single_allocation_cg_streaming_size_per_device })); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); // check whether the maximum single memory allocation sizes per device can be satisfied @@ -459,20 +492,32 @@ std::tuple, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { // max mem alloc size constraints not fulfilled - if (comm_.size() <= 1) { + if (!comm_.is_mpi_parallel()) { // output only if a single MPI rank is used detail::log_untracked(verbosity_level::full, comm_, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_implicit.\n", + "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_streaming.\n", format_vector(failed_cg_explicit_constraints)); } // can't use cg_explicit + used_solver = solver_type::cg_streaming; + } + if (const std::vector failed_cg_streaming_constraints = check_sizes(max_single_allocation_cg_streaming_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_streaming && !failed_cg_streaming_constraints.empty()) { + // max mem alloc size constraints not fulfilled + if (!comm_.is_mpi_parallel()) { + detail::log_untracked(verbosity_level::full, + comm_, + "Cannot use cg_streaming due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", + format_vector(failed_cg_streaming_constraints)); + } + // can't use cg_streaming used_solver = solver_type::cg_implicit; } if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { // can't fulfill maximum single memory allocation size even for cg_implicit - if (comm_.size() <= 1) { + if (!comm_.is_mpi_parallel()) { // output only if a single MPI rank is used plssvm::detail::log_untracked(verbosity_level::full | verbosity_level::warning, comm_, @@ -484,7 +529,7 @@ std::tuple, std::vector, std::vector(backends_to_initialize); + environment_guard = std::make_unique(argc, argv, backends_to_initialize); // create default csvm const std::unique_ptr svm = [&]() { if (use_sycl_as_backend) { - return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); + return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_data_parallel_kernel = cmd_parser.sycl_data_parallel_kernel); } else if (use_kokkos_as_backend) { return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); } else { diff --git a/src/main_train.cpp b/src/main_train.cpp index cf4893946..7811b4e82 100644 --- a/src/main_train.cpp +++ b/src/main_train.cpp @@ -158,12 +158,12 @@ int main(int argc, char *argv[]) { if (use_kokkos_as_backend) { backends_to_initialize.push_back(plssvm::backend_type::kokkos); } - environment_guard = std::make_unique(backends_to_initialize); + environment_guard = std::make_unique(argc, argv, backends_to_initialize); // create SVM const std::unique_ptr svm = [&]() { if (use_sycl_as_backend) { - return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type); + return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_data_parallel_kernel = cmd_parser.sycl_data_parallel_kernel); } else if (use_kokkos_as_backend) { return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space); } else { diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu index 93db10d36..5328eb840 100644 --- a/src/plssvm/backends/CUDA/csvm.cu +++ b/src/plssvm/backends/CUDA/csvm.cu @@ -164,7 +164,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type 
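To make the cg_explicit → cg_streaming → cg_implicit fallback chain in the csvm.hpp hunks easier to follow, here is a condensed, stand-alone sketch of the decision logic. The memory bookkeeping is heavily simplified and all names except `solver_type` are illustrative; the real implementation additionally logs why a solver was rejected and repeats the checks for the maximum single allocation size.

```cpp
#include <cstddef>    // std::size_t
#include <stdexcept>  // std::runtime_error
#include <vector>     // std::vector

enum class solver_type { cg_explicit, cg_streaming, cg_implicit };  // simplified subset

// Illustrative only: pick the first solver whose memory requirements fit.
// 'needed_*' holds the per-device memory needed by each solver, 'usable_device' the per-device budget,
// and 'usable_system' the host memory budget relevant for the USM-backed cg_streaming solver.
solver_type select_solver(const std::vector<std::size_t> &needed_explicit,
                          const std::vector<std::size_t> &needed_streaming,
                          const std::vector<std::size_t> &needed_implicit,
                          const std::vector<std::size_t> &usable_device,
                          const std::size_t needed_system_streaming,
                          const std::size_t usable_system) {
    // true if every device's requirement fits into its budget (both vectors have one entry per device)
    const auto fits = [](const std::vector<std::size_t> &needed, const std::vector<std::size_t> &budget) {
        for (std::size_t i = 0; i < needed.size(); ++i) {
            if (needed[i] > budget[i]) { return false; }
        }
        return true;
    };
    if (fits(needed_explicit, usable_device)) { return solver_type::cg_explicit; }
    if (needed_system_streaming <= usable_system && fits(needed_streaming, usable_device)) { return solver_type::cg_streaming; }
    if (fits(needed_implicit, usable_device)) { return solver_type::cg_implicit; }
    throw std::runtime_error{ "Not enough device memory available even for the cg_implicit solver!" };
}
```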
&data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -179,7 +179,10 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // only store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to CUDA's native dim3 diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index 5d7ba74bb..00f20f66e 100644 --- a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -25,21 +25,25 @@ namespace plssvm::cuda::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { - if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { + if (queue_ < 0 || queue_ >= get_device_count()) { throw backend_exception{ fmt::format("Illegal device ID! 
Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_CUDA_ERROR_CHECK(cudaMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } diff --git a/src/plssvm/backends/HIP/csvm.hip b/src/plssvm/backends/HIP/csvm.hip index 312ad3122..fc2d4e5b7 100644 --- a/src/plssvm/backends/HIP/csvm.hip +++ b/src/plssvm/backends/HIP/csvm.hip @@ -180,7 +180,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -195,7 +195,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to HIP's native dim3 diff --git a/src/plssvm/backends/HIP/detail/device_ptr.hip b/src/plssvm/backends/HIP/detail/device_ptr.hip index 560783097..c958c73fd 100644 --- a/src/plssvm/backends/HIP/detail/device_ptr.hip +++ b/src/plssvm/backends/HIP/detail/device_ptr.hip @@ -29,21 +29,25 @@ namespace plssvm::hip::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type 
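The CUDA (and, below, HIP) device_ptr changes boil down to choosing between a device-only and a managed/unified-shared-memory allocation at construction time. A minimal sketch of that branch using the plain CUDA runtime API follows; the surrounding class is hypothetical and error checking is omitted, but the two allocation calls are the ones used in the diff.

```cpp
#include <cstddef>         // std::size_t
#include <cuda_runtime.h>  // cudaMalloc, cudaMallocManaged, cudaMemset

// Minimal sketch: allocate 'count' values of type T either device-only (cg_explicit)
// or as managed/USM memory that may exceed the device capacity (cg_streaming).
template <typename T>
T *allocate_device_memory(const std::size_t count, const bool use_usm_allocations) {
    T *data = nullptr;
    if (use_usm_allocations) {
        cudaMallocManaged(&data, count * sizeof(T));  // host-visible, migrated on demand
    } else {
        cudaMalloc(&data, count * sizeof(T));         // resides in device memory only
    }
    cudaMemset(data, 0, count * sizeof(T));           // zero-initialize, like this->memset(0) above
    return data;
}
```

The HIP path is the mirror image using hipMallocManaged/hipMalloc.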
device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_HIP_ERROR_CHECK(hipMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } diff --git a/src/plssvm/backends/HPX/csvm.cpp b/src/plssvm/backends/HPX/csvm.cpp index 71f651688..24f1c3d70 100644 --- a/src/plssvm/backends/HPX/csvm.cpp +++ b/src/plssvm/backends/HPX/csvm.cpp @@ -18,6 +18,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -113,6 +114,7 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { // calculate the number of data points this device is responsible for const std::size_t device_specific_num_rows = dist.place_specific_num_rows(0); @@ -120,26 +122,33 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, 
cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -199,17 +208,18 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); - PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + PLSSVM_ASSERT(explicit_A != nullptr, "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); @@ -261,6 +271,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s }); // wait until operation is completed wait.get(); + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -317,6 +329,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -358,6 +372,9 @@ aos_matrix csvm::predict_values(const parameter ¶ms, }); // wait until operation is completed wait.get(); + + // restore padding entries by setting them to zero + out.restore_padding(); return out; } diff --git a/src/plssvm/backends/Kokkos/CMakeLists.txt b/src/plssvm/backends/Kokkos/CMakeLists.txt index 
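The HPX backend now avoids value-initializing a potentially multi-gigabyte std::vector for the kernel matrix: it allocates uninitialized storage via make_unique_for_overwrite and zeroes it in parallel. The sketch below shows what those two helpers amount to; PLSSVM's actual implementations in plssvm/detail/make_unique_for_overwrite.hpp may differ, and the OpenMP pragma is used here purely for illustration.

```cpp
#include <cstddef>  // std::size_t
#include <memory>   // std::unique_ptr

// Stand-in for std::make_unique_for_overwrite<T[]> (C++20): the elements are
// default-initialized, i.e., left uninitialized for arithmetic types.
template <typename T>
std::unique_ptr<T[]> make_unique_for_overwrite(const std::size_t size) {
    return std::unique_ptr<T[]>{ new T[size] };
}

// Zero the buffer in parallel instead of relying on (sequential) value-initialization.
template <typename T>
void parallel_zero_memset(T *ptr, const std::size_t size) {
    #pragma omp parallel for
    for (std::size_t i = 0; i < size; ++i) {
        ptr[i] = T{ 0 };
    }
}
```

In the hunk above, the kernel matrix is then handed to the assembly and SYMM kernels via `kernel_matrix.get()` instead of as a `std::vector`, and the BLAS level 3 result as well as the predict outputs restore their padding entries to zero afterwards.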
371991c1f..818cb4794 100644 --- a/src/plssvm/backends/Kokkos/CMakeLists.txt +++ b/src/plssvm/backends/Kokkos/CMakeLists.txt @@ -28,6 +28,7 @@ set(PLSSVM_KOKKOS_SOURCES ${CMAKE_CURRENT_LIST_DIR}/detail/utility.cpp ${CMAKE_CURRENT_LIST_DIR}/csvm.cpp ${CMAKE_CURRENT_LIST_DIR}/exceptions.cpp + ${CMAKE_CURRENT_LIST_DIR}/memory_space.cpp ) # set target properties diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp index 2bf512433..e41aa14f4 100644 --- a/src/plssvm/backends/Kokkos/csvm.cpp +++ b/src/plssvm/backends/Kokkos/csvm.cpp @@ -20,7 +20,7 @@ #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::kokkos::detail::device_kernel_assembly #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::kokkos::detail::device_kernel_assembly_symm #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp" // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE +#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked @@ -51,9 +51,11 @@ #include // std::numeric_limits::max #include // std::map #include // std::string -#include // std::move +#include // std::move, std::forward #include // std::vector +namespace { + // a dummy class used as functor to the team_size_max function template struct dummy { @@ -61,6 +63,114 @@ struct dummy { void operator()(const typename Kokkos::TeamPolicy::member_type &) const { } }; +/** + * @brief Run the kernel functor on the given device. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] partial_grid the number of work-groups in each dimension of the execution grid + * @param[in] block the number of work-items in each dimension per work-group + * @param[in] args the parameters necessary for the specific kernel functor + */ +template +void run_kernel_functor(const std::string &kernel_name, const TeamPolicy &policy, Args &&...args) { + Kokkos::parallel_for(kernel_name, policy, KernelFunctor{ std::forward(args)... }); +} + +/** + * @brief Dispatch the kernel functor to the correct kernel function type. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam ExecutionSpace the used Kokkos execution space + * @tparam USMEnabledMemorySpace the possibly USM enabled Kokkos memory space + * @tparam target the target platform to run the kernel on + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] params the parameters used to determine the kernel function type + * @param[in] args the parameters necessary for the specific kernel functor + */ +template
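The template parameter lists of the new Kokkos helper functions were lost in the hunk above; the generic launcher presumably looks roughly like the following reconstruction, so the exact parameter list is an assumption. It simply constructs the kernel functor from its constructor arguments and dispatches it through `Kokkos::parallel_for` under the given kernel name.

```cpp
#include <Kokkos_Core.hpp>  // Kokkos::parallel_for
#include <string>           // std::string
#include <utility>          // std::forward

// Reconstructed sketch: build the kernel functor from its arguments and launch it
// with the given team policy under a human-readable kernel name.
template <typename KernelFunctor, typename TeamPolicy, typename... Args>
void run_kernel_functor(const std::string &kernel_name, const TeamPolicy &policy, Args &&...args) {
    Kokkos::parallel_for(kernel_name, policy, KernelFunctor{ std::forward<Args>(args)... });
}
```

The dispatch helper whose documentation starts at the end of this hunk then presumably selects the concrete kernel functor (e.g., based on `params.kernel_type` and the target platform) and forwards its arguments to this launcher.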