From 235cdcab222e49f203e4cf4d6c5377b6f2a7155a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Oct 2024 12:09:42 +0200 Subject: [PATCH 01/93] Add cg_streaming enum class case. --- include/plssvm/solver_types.hpp | 2 ++ src/plssvm/solver_types.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/include/plssvm/solver_types.hpp b/include/plssvm/solver_types.hpp index 0dca4dad9..474db009e 100644 --- a/include/plssvm/solver_types.hpp +++ b/include/plssvm/solver_types.hpp @@ -32,6 +32,8 @@ enum class solver_type { automatic, /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the device. */ cg_explicit, + /** Use the CG algorithm explicitly calculating the kernel matrix and fully storing it on the host. Realized using unified shared memory. */ + cg_streaming, /** Use the CG algorithm implicitly recomputing the kernel matrix each CG iteration (smallest memory footprint). */ cg_implicit }; diff --git a/src/plssvm/solver_types.cpp b/src/plssvm/solver_types.cpp index c830728ec..82a70f589 100644 --- a/src/plssvm/solver_types.cpp +++ b/src/plssvm/solver_types.cpp @@ -23,6 +23,8 @@ std::ostream &operator<<(std::ostream &out, const solver_type solving) { return out << "automatic"; case solver_type::cg_explicit: return out << "cg_explicit"; + case solver_type::cg_streaming: + return out << "cg_streaming"; case solver_type::cg_implicit: return out << "cg_implicit"; } @@ -38,6 +40,8 @@ std::istream &operator>>(std::istream &in, solver_type &solving) { solving = solver_type::automatic; } else if (str == "cg_explicit") { solving = solver_type::cg_explicit; + } else if (str == "cg_streaming") { + solving = solver_type::cg_streaming; } else if (str == "cg_implicit") { solving = solver_type::cg_implicit; } else { From 0006b9ac8d3c9cc19041f860d2e31db8b5a6a1c9 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Oct 2024 12:15:14 +0200 Subject: [PATCH 02/93] Add device_ptr flag to enable shared/managed memory allocations. --- .../backends/CUDA/detail/device_ptr.cuh | 7 +++-- include/plssvm/backends/gpu_device_ptr.hpp | 31 ++++++++++++------- src/plssvm/backends/CUDA/detail/device_ptr.cu | 18 ++++++----- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/include/plssvm/backends/CUDA/detail/device_ptr.cuh b/include/plssvm/backends/CUDA/detail/device_ptr.cuh index de2d8546d..bb99e1ffe 100644 --- a/include/plssvm/backends/CUDA/detail/device_ptr.cuh +++ b/include/plssvm/backends/CUDA/detail/device_ptr.cuh @@ -32,6 +32,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr, - "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); + "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); public: /// The type of the values used in the device_ptr. @@ -57,14 +57,14 @@ class gpu_device_ptr { * @param[in] size the size of the managed memory * @param[in] queue the queue (or similar) to manage the device_ptr */ - gpu_device_ptr(size_type size, const queue_type queue); + gpu_device_ptr(size_type size, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape. * @details The managed memory size is: extents[0] * extents[1]. 
* @param[in] shape the 2D size of the managed memory; size = shape.x * shape.y * @param[in] queue the queue (or similar) to manage the device_ptr */ - gpu_device_ptr(plssvm::shape shape, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, const queue_type queue, bool use_usm_allocations); /** * @brief Construct a device_ptr for the device managed by @p queue with the provided @p shape including @p padding. * @details The managed memory size is: (shape.x + padding.x) * (shape.y + padding.y). @@ -72,7 +72,7 @@ class gpu_device_ptr { * @param[in] padding the padding applied to the extents * @param[in] queue the queue (or similar) to manage the device_ptr */ - gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue); + gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue, bool use_usm_allocations); /** * @brief Delete copy-constructor to make device_ptr a move only type. @@ -368,31 +368,36 @@ class gpu_device_ptr { plssvm::shape padding_{}; /// The device pointer pointing to the managed memory. device_pointer_type data_{}; + /// If true, use USM allocations automatically migrating the data between host and device. + bool use_usm_allocations_{}; }; - template -gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const size_type size, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ plssvm::shape{ size, 1 } } { } + shape_{ plssvm::shape{ size, 1 } }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, - shape_{ shape } { } + shape_{ shape }, + use_usm_allocations_{ use_usm_allocations } { } template -gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue) : +gpu_device_ptr::gpu_device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type queue, const bool use_usm_allocations) : queue_{ queue }, shape_{ shape }, - padding_{ padding } { } + padding_{ padding }, + use_usm_allocations_{ use_usm_allocations } { } template gpu_device_ptr::gpu_device_ptr(gpu_device_ptr &&other) noexcept : queue_{ std::exchange(other.queue_, queue_type{}) }, shape_{ std::exchange(other.shape_, plssvm::shape{}) }, padding_{ std::exchange(other.padding_, plssvm::shape{}) }, - data_{ std::exchange(other.data_, device_pointer_type{}) } { } + data_{ std::exchange(other.data_, device_pointer_type{}) }, + use_usm_allocations_{ std::exchange(other.use_usm_allocations_, false) } { } template auto gpu_device_ptr::operator=(gpu_device_ptr &&other) noexcept -> gpu_device_ptr & { @@ -402,6 +407,7 @@ auto gpu_device_ptr::opera shape_ = std::exchange(other.shape_, plssvm::shape{}); padding_ = std::exchange(other.padding_, plssvm::shape{}); data_ = std::exchange(other.data_, device_pointer_type{}); + use_usm_allocations_ = std::exchange(other.use_usm_allocations_, false); } return *this; } @@ -412,6 +418,7 @@ void gpu_device_ptr::swap( std::swap(shape_, other.shape_); std::swap(padding_, other.padding_); std::swap(data_, other.data_); + std::swap(use_usm_allocations_, other.use_usm_allocations_); } template diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index 5d7ba74bb..87d069409 100644 --- 
a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -25,21 +25,25 @@ namespace plssvm::cuda::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_CUDA_ERROR_CHECK(cudaMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_CUDA_ERROR_CHECK(cudaMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } From 3508e154af2c51a83884ee7b52fab5b43e1ce045 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 2 Oct 2024 12:15:30 +0200 Subject: [PATCH 03/93] Allocate kernel matrix using shared memory for cg_streaming. 
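Together with the device_ptr flag from the previous patch, the streaming path only swaps the allocation routine used for the kernel matrix: cudaMallocManaged instead of cudaMalloc. A minimal standalone sketch of that choice (a hypothetical helper outside of PLSSVM, error checking via PLSSVM_CUDA_ERROR_CHECK omitted):

    #include <cuda_runtime.h>
    #include <cstddef>

    double *allocate_kernel_matrix(const std::size_t num_entries, const bool use_usm_allocations) {
        double *ptr = nullptr;
        if (use_usm_allocations) {
            // cg_streaming: managed (USM) memory that may exceed the device memory
            // and is migrated between host and device on demand
            cudaMallocManaged(&ptr, num_entries * sizeof(double));
        } else {
            // cg_explicit: plain device memory, must fit on the GPU
            cudaMalloc(&ptr, num_entries * sizeof(double));
        }
        cudaMemset(ptr, 0, num_entries * sizeof(double));
        return ptr;  // cudaFree releases both kinds of allocation
    }

Both branches return a raw device-usable pointer, which is why the remaining CUDA backend code can stay unchanged.
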
--- include/plssvm/backends/CUDA/csvm.hpp | 3 ++- include/plssvm/backends/gpu_csvm.hpp | 6 ++++-- include/plssvm/csvm.hpp | 2 ++ src/plssvm/backends/CUDA/csvm.cu | 7 +++++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index 5e0eed30d..d18e3395a 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -22,6 +22,7 @@ #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/type_traits.hpp" // PLSSVM_REQUIRES #include "plssvm/parameter.hpp" // plssvm::parameter +#include "plssvm/solver_types.hpp" // plssvm::solver_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform #include // std::size_t @@ -152,7 +153,7 @@ class csvm : public ::plssvm::detail::gpu_csvm gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, solver, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -385,6 +386,7 @@ void gpu_csvm::blas_level_3(const solver // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const auto &A_d = detail::move_only_any_cast(A[device_id]); PLSSVM_ASSERT(!A_d.empty(), "The A matrix must not be empty!"); diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 3e0ea2472..723c4bfd9 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -836,6 +836,8 @@ std::tuple, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(total_memory_needed_explicit_per_device, usable_device_memory_per_device); failed_cg_explicit_constraints.empty()) { diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu index 9eebc97e3..1e00f7edf 100644 --- a/src/plssvm/backends/CUDA/csvm.cu +++ b/src/plssvm/backends/CUDA/csvm.cu @@ -150,7 +150,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const solver_type solver, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -165,7 +165,10 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // only store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == 
solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, solver == solver_type::cg_streaming }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to CUDA's native dim3 From bf19526e30b095aa9705b75634fa0a8d964bd950 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 10:32:08 +0200 Subject: [PATCH 04/93] Use USM allocations in BLAS kernel and slightly change API. --- include/plssvm/backends/CUDA/csvm.hpp | 2 +- include/plssvm/backends/gpu_csvm.hpp | 11 ++++++----- src/plssvm/backends/CUDA/csvm.cu | 4 ++-- src/plssvm/backends/CUDA/detail/device_ptr.cu | 18 +++++++++++------- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index d18e3395a..1cb1b8268 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -153,7 +153,7 @@ class csvm : public ::plssvm::detail::gpu_csvm gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, solver, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, solver == solver_type::cg_streaming, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -331,7 +332,7 @@ void gpu_csvm::blas_level_3(const solver // the partial C result from a specific device later stored on device 0 to perform the C reduction (inplace matrix addition) device_ptr_type partial_C_d{}; if (num_devices > 1) { - partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0] }; + partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], solver == solver_type::cg_streaming }; } // split memory allocation and memory copy! 
@@ -344,8 +345,8 @@ void gpu_csvm::blas_level_3(const solver const queue_type &device = devices_[device_id]; // allocate memory on the device - B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device }; - C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device }; + B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, solver == solver_type::cg_streaming }; + C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, solver == solver_type::cg_streaming }; } #pragma omp parallel for ordered if (num_devices > 1) diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu index 1e00f7edf..4d93723a7 100644 --- a/src/plssvm/backends/CUDA/csvm.cu +++ b/src/plssvm/backends/CUDA/csvm.cu @@ -150,7 +150,7 @@ std::size_t csvm::get_max_work_group_size(const std::size_t device_id) const { // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const solver_type solver, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -168,7 +168,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons // only store the upper triangular matrix // if solver == solver_type::cg_explicit: store it explicitly // if solver == solver_type::cg_streaming: store it using USM - device_ptr_type kernel_matrix_d{ num_entries_padded, device, solver == solver_type::cg_streaming }; + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to CUDA's native dim3 diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index 87d069409..a8aece30f 100644 --- a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -35,7 +35,7 @@ device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, co template device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : base_type{ shape, padding, device, use_usm_allocations } { - if (queue_ < 0 || queue_ >= static_cast(get_device_count())) { + if (queue_ < 0 || queue_ >= get_device_count()) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); @@ -97,9 +97,11 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) + if (!use_usm_allocations_) { + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) + } } template @@ -120,9 +122,11 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) + if (!use_usm_allocations_) { + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) + } } template From e403c62707289a393cd2edcb94651581935a6dd4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 10:44:54 +0200 Subject: [PATCH 05/93] Remove USM related if in copy functions. --- src/plssvm/backends/CUDA/detail/device_ptr.cu | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/plssvm/backends/CUDA/detail/device_ptr.cu b/src/plssvm/backends/CUDA/detail/device_ptr.cu index a8aece30f..00f20f66e 100644 --- a/src/plssvm/backends/CUDA/detail/device_ptr.cu +++ b/src/plssvm/backends/CUDA/detail/device_ptr.cu @@ -97,11 +97,9 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); - if (!use_usm_allocations_) { - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) - } + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(data_ + pos, data_to_copy, rcount * sizeof(value_type), cudaMemcpyHostToDevice)) } template @@ -122,11 +120,9 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); - if (!use_usm_allocations_) { - detail::set_device(queue_); - const size_type rcount = std::min(count, this->size_padded() - pos); - PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) - } + detail::set_device(queue_); + const size_type rcount = std::min(count, this->size_padded() - pos); + PLSSVM_CUDA_ERROR_CHECK(cudaMemcpy(buffer, data_ + pos, rcount * sizeof(value_type), cudaMemcpyDeviceToHost)) } template From 766386070b9f8cb59b456fb9fb17fab9012a3c07 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 11:11:13 +0200 Subject: [PATCH 06/93] Use variable to specify whether USM allocations should be used. --- include/plssvm/backends/gpu_csvm.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/gpu_csvm.hpp b/include/plssvm/backends/gpu_csvm.hpp index 9ea248dc1..089d8cbb0 100644 --- a/include/plssvm/backends/gpu_csvm.hpp +++ b/include/plssvm/backends/gpu_csvm.hpp @@ -230,6 +230,7 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvmnum_available_devices(); const std::size_t num_rows_reduced = A.shape().x - 1; @@ -253,8 +254,8 @@ std::vector<::plssvm::detail::move_only_any> gpu_csvm gpu_csvmrun_assemble_kernel_matrix_explicit(device_id, exec, params, solver == solver_type::cg_streaming, data_d[device_id], q_red_d[device_id], QA_cost); + device_ptr_type kernel_matrix = this->run_assemble_kernel_matrix_explicit(device_id, exec, params, use_usm_allocations, data_d[device_id], q_red_d[device_id], QA_cost); kernel_matrices_parts[device_id] = ::plssvm::detail::move_only_any{ std::move(kernel_matrix) }; } break; @@ -323,6 +324,7 @@ void gpu_csvm::blas_level_3(const solver PLSSVM_ASSERT(B.shape() == C.shape(), "The B ({}) and C ({}) matrices must have the same shape!", B.shape(), C.shape()); PLSSVM_ASSERT(B.padding() == C.padding(), "The B ({}) and C ({}) matrices must have the same padding!", B.padding(), C.padding()); + const bool use_usm_allocations = solver == solver_type::cg_streaming; const std::size_t num_devices = this->num_available_devices(); // the C and B matrices; completely stored on each device @@ -332,7 +334,7 @@ void gpu_csvm::blas_level_3(const solver // the partial C result from a specific device later stored on device 0 to perform the C reduction (inplace matrix addition) device_ptr_type partial_C_d{}; if (num_devices > 1) { - partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], solver == solver_type::cg_streaming }; + partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], use_usm_allocations }; } // split memory allocation and memory copy! 
@@ -345,8 +347,8 @@ void gpu_csvm::blas_level_3(const solver const queue_type &device = devices_[device_id]; // allocate memory on the device - B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, solver == solver_type::cg_streaming }; - C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, solver == solver_type::cg_streaming }; + B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, use_usm_allocations }; + C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, use_usm_allocations }; } #pragma omp parallel for ordered if (num_devices > 1) From cd6deeadd8741f7c340687b69f507e5108e5f866 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 11:48:25 +0200 Subject: [PATCH 07/93] Add solver_type::automatic handling for cg_streaming. --- include/plssvm/csvm.hpp | 99 +++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 723c4bfd9..1e23aec9f 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -46,6 +46,7 @@ #include // std::size_t #include // std::numeric_limits::lowest #include // std::unique_ptr +#include // std::accumulate #include // std::optional, std::make_optional, std::nullopt #include // std::milli #include // std::tie @@ -791,6 +792,7 @@ std::tuple, std::vector, std::vectornum_available_devices() }; const std::vector total_memory_needed_explicit_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); + const detail::memory_size total_memory_needed_streaming = std::accumulate(total_memory_needed_explicit_per_device.cbegin(), total_memory_needed_explicit_per_device.cend(), detail::memory_size{}); const std::vector total_memory_needed_implicit_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); // format a vector differentiating between it containing only a single entry or multiple @@ -809,8 +811,9 @@ std::tuple, std::vector, std::vector(percentual_safety_margin * 100.0L), minimal_safety_margin, detail::tracking::tracking_entry{ "solver", "system_memory", total_system_memory }, @@ -818,10 +821,12 @@ std::tuple, std::vector, std::vector, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(total_memory_needed_explicit_per_device, usable_device_memory_per_device); failed_cg_explicit_constraints.empty()) { @@ -846,55 +849,65 @@ std::tuple, std::vector, std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + if (total_memory_needed_streaming <= usable_system_memory) { // use the implicit solver type - used_solver = solver_type::cg_implicit; + used_solver = solver_type::cg_streaming; } else { - // not enough device memory available for the implicit case - throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + detail::log(verbosity_level::full, "Cannot use cg_streaming due to memory constraints on the system memory!\n"); + + // check whether there is enough memory available for cg_implicit + if (const std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { + // use the implicit solver type + used_solver = 
solver_type::cg_implicit; + } else { + // not enough device memory available for the implicit case + throw kernel_launch_resources{ fmt::format("Not enough device memory available on device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + } } } // enforce max mem alloc size if requested #if defined(PLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE) - // get the maximum possible memory allocation size per device - const std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); + // not applicable for the streaming CG implementation using USM! + if (used_solver != solver_type::cg_streaming) { + // get the maximum possible memory allocation size per device + const std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); - // get the maximum single allocation size per device - const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + // get the maximum single allocation size per device + const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - // output the maximum memory allocation size per device - detail::log(verbosity_level::full, - " - maximum supported single memory allocation size: {}\n" - " - maximum needed single memory allocation size (cg_explicit): {}\n" - " - maximum needed single memory allocation size (cg_implicit): {}\n", - format_vector(max_mem_alloc_size_per_device), - format_vector(max_single_allocation_cg_explicit_size_per_device), - format_vector(max_single_allocation_cg_implicit_size_per_device)); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); - - // check whether the maximum single memory allocation sizes per device can be satisfied - // check whether the maximum single cg_explicit memory allocation size can be satisfied - if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { - // max mem alloc size constraints not fulfilled + // output the maximum memory allocation size per device detail::log(verbosity_level::full, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", - format_vector(failed_cg_explicit_constraints)); - // can't use cg_explicit - used_solver = solver_type::cg_implicit; - } - if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { - // can't fulfill maximum single memory allocation size even for cg_implicit - plssvm::detail::log(verbosity_level::full | verbosity_level::warning, - "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " - "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); - throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + " - maximum supported single memory allocation size: {}\n" + " - maximum needed single memory allocation size (cg_explicit): {}\n" + " - maximum needed single memory allocation size (cg_implicit): {}\n", + format_vector(max_mem_alloc_size_per_device), + format_vector(max_single_allocation_cg_explicit_size_per_device), + format_vector(max_single_allocation_cg_implicit_size_per_device)); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); + + // check whether the maximum single memory allocation sizes per device can be satisfied + // check whether the maximum single cg_explicit memory allocation size can be satisfied + if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { + // max mem alloc size constraints not fulfilled + detail::log(verbosity_level::full, + "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", + format_vector(failed_cg_explicit_constraints)); + // can't use cg_explicit + used_solver = solver_type::cg_implicit; + } + if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { + // can't fulfill maximum single memory allocation size even for cg_implicit + plssvm::detail::log(verbosity_level::full | verbosity_level::warning, + "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " + "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); + throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; + } } #endif } From 2dc78811355a393d12017ffd21f57dd5661cfbfc Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:42:51 +0200 Subject: [PATCH 08/93] Only use USM for the kernel matrix. --- include/plssvm/backends/CUDA/detail/device_ptr.cuh | 3 +++ include/plssvm/backends/gpu_csvm.hpp | 11 +++++------ include/plssvm/backends/gpu_device_ptr.hpp | 3 +++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/CUDA/detail/device_ptr.cuh b/include/plssvm/backends/CUDA/detail/device_ptr.cuh index bb99e1ffe..e361a8d1d 100644 --- a/include/plssvm/backends/CUDA/detail/device_ptr.cuh +++ b/include/plssvm/backends/CUDA/detail/device_ptr.cuh @@ -59,6 +59,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr gpu_csvm::blas_level_3(const solver PLSSVM_ASSERT(B.shape() == C.shape(), "The B ({}) and C ({}) matrices must have the same shape!", B.shape(), C.shape()); PLSSVM_ASSERT(B.padding() == C.padding(), "The B ({}) and C ({}) matrices must have the same padding!", B.padding(), C.padding()); - const bool use_usm_allocations = solver == solver_type::cg_streaming; const std::size_t num_devices = this->num_available_devices(); // the C and B matrices; completely stored on each device @@ -334,7 +333,7 @@ void gpu_csvm::blas_level_3(const solver // the partial C result from a specific device later stored on device 0 to perform the C reduction (inplace matrix addition) device_ptr_type partial_C_d{}; if (num_devices > 1) { - partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0], use_usm_allocations }; + partial_C_d = device_ptr_type{ C.shape(), C.padding(), devices_[0] }; } // split memory allocation and memory copy! 
@@ -347,8 +346,8 @@ void gpu_csvm::blas_level_3(const solver const queue_type &device = devices_[device_id]; // allocate memory on the device - B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device, use_usm_allocations }; - C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device, use_usm_allocations }; + B_d[device_id] = device_ptr_type{ B.shape(), B.padding(), device }; + C_d[device_id] = device_ptr_type{ C.shape(), C.padding(), device }; } #pragma omp parallel for ordered if (num_devices > 1) diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index b5d396051..c4a277e06 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -56,6 +56,7 @@ class gpu_device_ptr { * @brief Construct a device_ptr for the device managed by @p queue with the extents { @p size, 1 }. * @param[in] size the size of the managed memory * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ gpu_device_ptr(size_type size, const queue_type queue, bool use_usm_allocations); /** @@ -63,6 +64,7 @@ class gpu_device_ptr { * @details The managed memory size is: extents[0] * extents[1]. * @param[in] shape the 2D size of the managed memory; size = shape.x * shape.y * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ gpu_device_ptr(plssvm::shape shape, const queue_type queue, bool use_usm_allocations); /** @@ -71,6 +73,7 @@ class gpu_device_ptr { * @param[in] shape the extents of the managed memory * @param[in] padding the padding applied to the extents * @param[in] queue the queue (or similar) to manage the device_ptr + * @param[in] use_usm_allocations if `true` use USM allocations in the respective backend */ gpu_device_ptr(plssvm::shape shape, plssvm::shape padding, const queue_type queue, bool use_usm_allocations); From 55ad7211f6abf8b73e2e0a92a355eecaf9f05b8c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:44:58 +0200 Subject: [PATCH 09/93] Improve automatic solver_type handling. 
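With this patch the automatic mode chooses between three solvers instead of two. Stripped of the per-device data distribution, the safety margins, and the optional PLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE checks, the intended fallback order boils down to the following sketch (select_solver_sketch is a hypothetical helper for illustration only; the real code in csvm.hpp additionally checks the per-device memory required by the streaming solver):

    #include "plssvm/detail/memory_size.hpp"  // plssvm::detail::memory_size
    #include "plssvm/solver_types.hpp"        // plssvm::solver_type

    plssvm::solver_type select_solver_sketch(const plssvm::detail::memory_size needed_explicit,
                                             const plssvm::detail::memory_size usable_device_memory,
                                             const plssvm::detail::memory_size needed_streaming,
                                             const plssvm::detail::memory_size usable_system_memory) {
        if (needed_explicit <= usable_device_memory) {
            return plssvm::solver_type::cg_explicit;   // kernel matrix fits into device memory
        }
        if (needed_streaming <= usable_system_memory) {
            return plssvm::solver_type::cg_streaming;  // kernel matrix fits into system memory via USM
        }
        return plssvm::solver_type::cg_implicit;       // recompute the kernel matrix in every CG iteration
    }
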
--- include/plssvm/csvm.hpp | 112 ++++++++++++-------- include/plssvm/detail/data_distribution.hpp | 17 +++ src/plssvm/detail/data_distribution.cpp | 46 ++++++++ 3 files changed, 128 insertions(+), 47 deletions(-) diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 1e23aec9f..098111397 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -792,7 +792,7 @@ std::tuple, std::vector, std::vectornum_available_devices() }; const std::vector total_memory_needed_explicit_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); - const detail::memory_size total_memory_needed_streaming = std::accumulate(total_memory_needed_explicit_per_device.cbegin(), total_memory_needed_explicit_per_device.cend(), detail::memory_size{}); + const std::pair> total_memory_needed_streaming_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(num_features, num_rhs); const std::vector total_memory_needed_implicit_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_needed_per_place(num_features, num_rhs); // format a vector differentiating between it containing only a single entry or multiple @@ -811,9 +811,9 @@ std::tuple, std::vector, std::vector(percentual_safety_margin * 100.0L), minimal_safety_margin, detail::tracking::tracking_entry{ "solver", "system_memory", total_system_memory }, @@ -821,13 +821,15 @@ std::tuple, std::vector, std::vector std::vector { @@ -849,11 +851,17 @@ std::tuple, std::vector, std::vector failed_cg_streaming_constraints = check_sizes(total_memory_needed_streaming_per_device.second, usable_device_memory_per_device); + total_memory_needed_streaming_per_device.first <= usable_system_memory && failed_cg_streaming_constraints.empty()) { // use the implicit solver type used_solver = solver_type::cg_streaming; } else { - detail::log(verbosity_level::full, "Cannot use cg_streaming due to memory constraints on the system memory!\n"); + if (!failed_cg_streaming_constraints.empty()) { + detail::log(verbosity_level::full, "Cannot use cg_streaming due to memory constraints on device(s) {}!\n", format_vector(failed_cg_streaming_constraints)); + } + if (total_memory_needed_streaming_per_device.first > usable_system_memory) { + detail::log(verbosity_level::full, "Cannot use cg_streaming due to system memory constraints!\n"); + } // check whether there is enough memory available for cg_implicit if (const std::vector failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) { @@ -868,46 +876,56 @@ std::tuple, std::vector, std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); + // get the maximum possible memory allocation size per device + const std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); - // get the maximum single allocation size per device - const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + // get the maximum single allocation size per device + const std::vector max_single_allocation_cg_explicit_size_per_device = 
data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_streaming_size_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); + const std::vector max_single_allocation_cg_implicit_size_per_device = data_distribution.calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); - // output the maximum memory allocation size per device + // output the maximum memory allocation size per device + detail::log(verbosity_level::full, + " - maximum supported single memory allocation size: {}\n" + " - maximum needed single memory allocation size (cg_explicit): {}\n" + " - maximum needed single memory allocation size (cg_streaming): {}\n" + " - maximum needed single memory allocation size (cg_implicit): {}\n", + format_vector(max_mem_alloc_size_per_device), + format_vector(max_single_allocation_cg_explicit_size_per_device), + format_vector(max_single_allocation_cg_streaming_size_per_device), + format_vector(max_single_allocation_cg_implicit_size_per_device)); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_streaming", max_single_allocation_cg_streaming_size_per_device })); + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); + + // check whether the maximum single memory allocation sizes per device can be satisfied + // check whether the maximum single cg_explicit memory allocation size can be satisfied + if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { + // max mem alloc size constraints not fulfilled detail::log(verbosity_level::full, - " - maximum supported single memory allocation size: {}\n" - " - maximum needed single memory allocation size (cg_explicit): {}\n" - " - maximum needed single memory allocation size (cg_implicit): {}\n", - format_vector(max_mem_alloc_size_per_device), - format_vector(max_single_allocation_cg_explicit_size_per_device), - format_vector(max_single_allocation_cg_implicit_size_per_device)); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_single_mem_alloc_size", max_mem_alloc_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_explicit", max_single_allocation_cg_explicit_size_per_device })); - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "solver", "device_max_mem_alloc_size_cg_implicit", max_single_allocation_cg_implicit_size_per_device })); - - // check whether the maximum single memory allocation sizes per device can be 
satisfied - // check whether the maximum single cg_explicit memory allocation size can be satisfied - if (const std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { - // max mem alloc size constraints not fulfilled - detail::log(verbosity_level::full, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_implicit.\n", - format_vector(failed_cg_explicit_constraints)); - // can't use cg_explicit - used_solver = solver_type::cg_implicit; - } - if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); - used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { - // can't fulfill maximum single memory allocation size even for cg_implicit - plssvm::detail::log(verbosity_level::full | verbosity_level::warning, - "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " - "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); - throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; - } + "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_streaming.\n", + format_vector(failed_cg_explicit_constraints)); + // can't use cg_explicit + used_solver = solver_type::cg_streaming; + } + if (const std::vector failed_cg_streaming_constraints = check_sizes(max_single_allocation_cg_streaming_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_streaming && !failed_cg_streaming_constraints.empty()) { + // max mem alloc size constraints not fulfilled + detail::log(verbosity_level::full, + "Cannot use cg_streaming due to maximum single memory allocation constraints on device(s) {}! 
Falling back to cg_implicit.\n", + format_vector(failed_cg_streaming_constraints)); + // can't use cg_streaming + used_solver = solver_type::cg_implicit; + } + if (const std::vector failed_cg_implicit_constraints = check_sizes(max_single_allocation_cg_implicit_size_per_device, max_mem_alloc_size_per_device); + used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) { + // can't fulfill maximum single memory allocation size even for cg_implicit + plssvm::detail::log(verbosity_level::full | verbosity_level::warning, + "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, " + "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n"); + throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) }; } #endif } diff --git a/include/plssvm/detail/data_distribution.hpp b/include/plssvm/detail/data_distribution.hpp index 0d4acd5ac..c7968108a 100644 --- a/include/plssvm/detail/data_distribution.hpp +++ b/include/plssvm/detail/data_distribution.hpp @@ -20,6 +20,7 @@ #include // std::size_t #include // std::ostream forward declaration +#include // std::pair #include // std::vector namespace plssvm::detail { @@ -164,6 +165,22 @@ class triangular_data_distribution : public data_distribution { */ [[nodiscard]] std::vector calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** + * @brief Calculate the theoretical total memory needed per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical total memory needed per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::pair> calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(std::size_t num_features, std::size_t num_classes) const; + + /** + * @brief Calculate the theoretical maximum single memory allocation size per place for assembling the kernel matrix using USM. + * @param[in] num_features the total number of features + * @param[in] num_classes the total number of classes + * @return the theoretical maximum single memory allocation size per place for cg_streaming (`[[nodiscard]]`) + */ + [[nodiscard]] std::vector calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(std::size_t num_features, std::size_t num_classes) const; + /** * @brief Calculate the theoretical total memory needed per place for implicitly assembling the kernel matrix. 
* @param[in] num_features the total number of features diff --git a/src/plssvm/detail/data_distribution.cpp b/src/plssvm/detail/data_distribution.cpp index dc979761e..db326fa59 100644 --- a/src/plssvm/detail/data_distribution.cpp +++ b/src/plssvm/detail/data_distribution.cpp @@ -18,6 +18,7 @@ #include // std::max, std::fill #include // std::size_t #include // std::ostream +#include // std::pair, std::make_pair #include // std::vector [[nodiscard]] std::size_t calculate_data_set_num_entries(const std::size_t num_data_points, const std::size_t num_features) noexcept { @@ -170,6 +171,51 @@ std::vector triangular_data_distribution::calculate_maximum_explici return res; } +std::pair> triangular_data_distribution::calculate_maximum_streaming_kernel_matrix_memory_needed_per_place(const std::size_t num_features, const std::size_t num_classes) const { + PLSSVM_ASSERT(num_features > 0, "At least one feature must be present!"); + PLSSVM_ASSERT(num_classes > 0, "At least two classes must be present!"); + + const std::size_t num_places = this->num_places(); + const std::size_t num_rows = this->num_rows() + 1; // account for dimensional reduction + // first: system memory + // second: device memory + std::pair> res = std::make_pair(0_B, std::vector(num_places, 0_B)); + + for (std::size_t device_id = 0; device_id < num_places; ++device_id) { + // check whether the current device is responsible for at least one data point! + if (this->place_specific_num_rows(device_id) == 0) { + continue; + } + + // data set including padding + const std::size_t data_set_size = ::calculate_data_set_num_entries(num_rows, num_features); + + // the size of q_red + const std::size_t q_red_size = ::calculate_q_red_num_entries(num_rows); + + // the size of the explicitly stored kernel matrix + const std::size_t kernel_matrix_size{ this->calculate_explicit_kernel_matrix_num_entries_padded(device_id) }; + + // the B and C matrices for the explicit SYMM kernel + std::size_t blas_matrices_size = 2 * ::calculate_blas_matrix_entries(num_rows, num_classes); + if (device_id == 0 && num_places > 1) { + // device 0 has to save an additional matrix used to accumulate the partial results from the other devices + blas_matrices_size += ::calculate_blas_matrix_entries(num_rows, num_classes); + } + + // add up the individual sizes and report the memory size in BYTES + // for streaming, the kernel matrix is on the host, while everything else is on the device + res.first += memory_size{ sizeof(real_type) * kernel_matrix_size }; + res.second[device_id] = memory_size{ sizeof(real_type) * (q_red_size + std::max(data_set_size, blas_matrices_size)) }; + } + + return res; +} + +std::vector triangular_data_distribution::calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { + return this->calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_classes); +} + std::vector triangular_data_distribution::calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { PLSSVM_ASSERT(num_features > 0, "At least one feature must be present!"); PLSSVM_ASSERT(num_classes > 0, "At least two classes must be present!"); From dad3561fa3eb10600a2ca36ecd979a56a9245ee1 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:50:51 +0200 Subject: [PATCH 10/93] Implement cg_streaming via USM allocations in SYCL. 
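The SYCL backends (AdaptiveCpp and DPC++) follow the same pattern as CUDA, only with sycl::malloc_shared/sycl::malloc_device taking the place of cudaMallocManaged/cudaMalloc. A minimal standalone sketch (a hypothetical helper outside of PLSSVM, no error handling):

    #include <sycl/sycl.hpp>
    #include <cstddef>

    double *allocate_kernel_matrix(sycl::queue &q, const std::size_t num_entries, const bool use_usm_allocations) {
        double *ptr = use_usm_allocations
                          ? sycl::malloc_shared<double>(num_entries, q)   // cg_streaming: host/device migratable USM
                          : sycl::malloc_device<double>(num_entries, q);  // cg_explicit: device-only USM
        q.memset(ptr, 0, num_entries * sizeof(double)).wait();
        return ptr;  // released via sycl::free(ptr, q) in both cases
    }
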
--- .../plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp | 2 +- .../SYCL/AdaptiveCpp/detail/device_ptr.hpp | 10 +++++++--- include/plssvm/backends/SYCL/DPCPP/csvm.hpp | 2 +- .../backends/SYCL/DPCPP/detail/device_ptr.hpp | 10 +++++++--- src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp | 6 ++++-- .../SYCL/AdaptiveCpp/detail/device_ptr.cpp | 18 +++++++++++------- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 6 ++++-- .../backends/SYCL/DPCPP/detail/device_ptr.cpp | 18 +++++++++++------- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp index 131116260..121e891db 100644 --- a/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp +++ b/include/plssvm/backends/SYCL/AdaptiveCpp/csvm.hpp @@ -168,7 +168,7 @@ class csvm : public ::plssvm::detail::gpu_csvm device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -218,7 +218,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to SYCL's native range<2> diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 0338d10c9..2c571e591 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -26,17 +26,21 @@ namespace plssvm::adaptivecpp::detail { template -device_ptr::device_ptr(const size_type size, const queue &q) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const size_type size, const queue &q, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue &q) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue &q, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue &q) : - base_type{ shape, padding, q } { - data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue &q, const bool use_usm_allocations) : + base_type{ shape, padding, q, use_usm_allocations } { + if (use_usm_allocations_) { + 
data_ = ::sycl::malloc_shared(this->size_padded(), queue_.impl->sycl_queue); + } else { + data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); + } this->memset(0); } diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 4d626174b..5687d42ce 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -191,7 +191,7 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -206,7 +206,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to SYCL's native range<2> diff --git a/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp index c24b84407..456102d02 100644 --- a/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/detail/device_ptr.cpp @@ -23,17 +23,21 @@ namespace plssvm::dpcpp::detail { template -device_ptr::device_ptr(const size_type size, const queue &q) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const size_type size, const queue &q, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue &q) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, q } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue &q, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, q, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, plssvm::shape padding, const queue &q) : - base_type{ shape, padding, q } { - data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue &q, const bool use_usm_allocations) : + base_type{ shape, padding, q, use_usm_allocations } { + if (use_usm_allocations_) { + data_ = ::sycl::malloc_shared(this->size_padded(), 
queue_.impl->sycl_queue); + } else { + data_ = ::sycl::malloc_device(this->size_padded(), queue_.impl->sycl_queue); + } this->memset(0); } From f29c792ae8035a09149f420266107a7eb4549a3f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:54:53 +0200 Subject: [PATCH 11/93] Implement cg_streaming via USM allocations in HIP. --- include/plssvm/backends/HIP/csvm.hpp | 2 +- .../backends/HIP/detail/device_ptr.hip.hpp | 12 ++++++++---- src/plssvm/backends/HIP/csvm.hip | 6 ++++-- src/plssvm/backends/HIP/detail/device_ptr.hip | 18 +++++++++++------- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/plssvm/backends/HIP/csvm.hpp b/include/plssvm/backends/HIP/csvm.hpp index 12aea214d..d50b948e0 100644 --- a/include/plssvm/backends/HIP/csvm.hpp +++ b/include/plssvm/backends/HIP/csvm.hpp @@ -156,7 +156,7 @@ class csvm : public ::plssvm::detail::gpu_csvm device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const unsigned long long num_rows_reduced = data_d.shape().x - 1; const unsigned long long num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -180,7 +180,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to HIP's native dim3 diff --git a/src/plssvm/backends/HIP/detail/device_ptr.hip b/src/plssvm/backends/HIP/detail/device_ptr.hip index 560783097..c958c73fd 100644 --- a/src/plssvm/backends/HIP/detail/device_ptr.hip +++ b/src/plssvm/backends/HIP/detail/device_ptr.hip @@ -29,21 +29,25 @@ namespace plssvm::hip::detail { template -device_ptr::device_ptr(const size_type size, const queue_type device) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const size_type size, const queue_type device, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const queue_type device) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, device } { } +device_ptr::device_ptr(const plssvm::shape shape, const queue_type device, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, device, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device) : - base_type{ shape, padding, device } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const queue_type device, const bool use_usm_allocations) : + base_type{ shape, padding, device, use_usm_allocations } { if (queue_ < 0 || queue_ >= 
static_cast(get_device_count())) { throw backend_exception{ fmt::format("Illegal device ID! Must be in range: [0, {}) but is {}.", get_device_count(), queue_) }; } detail::set_device(queue_); - PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + if (use_usm_allocations_) { + PLSSVM_HIP_ERROR_CHECK(hipMallocManaged(&data_, this->size_padded() * sizeof(value_type))) + } else { + PLSSVM_HIP_ERROR_CHECK(hipMalloc(&data_, this->size_padded() * sizeof(value_type))) + } this->memset(0); } From c53ea4252c848f9fc90f7964ce96917c42ada829 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 14:59:35 +0200 Subject: [PATCH 12/93] For OpenMP and stdpar, cg_streaming is equal to cg_explicit. --- src/plssvm/backends/OpenMP/csvm.cpp | 2 ++ src/plssvm/backends/stdpar/csvm.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 938c4c843..526257278 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -100,6 +100,7 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const plssvm::detail::triangular_data_distribution dist{ A.num_rows() - 1, this->num_available_devices() }; std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix @@ -153,6 +154,7 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const std::size_t num_rhs = B.shape().x; const std::size_t num_rows = B.shape().y; diff --git a/src/plssvm/backends/stdpar/csvm.cpp b/src/plssvm/backends/stdpar/csvm.cpp index 1df113531..841fcaa34 100644 --- a/src/plssvm/backends/stdpar/csvm.cpp +++ b/src/plssvm/backends/stdpar/csvm.cpp @@ -68,6 +68,7 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const plssvm::detail::triangular_data_distribution dist{ A.num_rows() - 1, this->num_available_devices() }; std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix @@ -121,6 +122,7 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s // unreachable break; case solver_type::cg_explicit: + case solver_type::cg_streaming: { const std::size_t num_rhs = B.shape().x; const std::size_t num_rows = B.shape().y; From f41aa355f9c0f31d92ec74878b20e602849d5cd4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 16:20:51 +0200 Subject: [PATCH 13/93] Implement cg_streaming via USM allocations in OpenCL (using some ugly workarounds). 
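The SYCL and HIP patches above make the same allocation-time choice: for cg_streaming the device_ptr asks for unified shared memory that is accessible from both host and device, for cg_explicit it keeps a plain device-only allocation (sycl::malloc_shared vs. sycl::malloc_device, hipMallocManaged vs. hipMalloc). The self-contained sketch below is not part of the patch series and only illustrates that selection with plain SYCL 2020 USM calls; allocate_buffer and its parameters are invented for the example.

#include <sycl/sycl.hpp>

#include <cstddef>

template <typename T>
T *allocate_buffer(sycl::queue &q, const std::size_t count, const bool use_usm_allocations) {
    // shared USM is accessible on host and device and migrated by the runtime,
    // device USM lives in device memory only
    return use_usm_allocations ? sycl::malloc_shared<T>(count, q)
                               : sycl::malloc_device<T>(count, q);
}

int main() {
    sycl::queue q{};
    double *data = allocate_buffer<double>(q, 1024, /*use_usm_allocations=*/true);
    q.fill(data, 0.0, 1024).wait();  // zero the buffer, mirroring the memset(0) in the constructors above
    sycl::free(data, q);
    return 0;
}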
--- include/plssvm/backends/OpenCL/csvm.hpp | 2 +- .../backends/OpenCL/detail/device_ptr.hpp | 28 +++- .../plssvm/backends/OpenCL/detail/utility.hpp | 17 ++- include/plssvm/detail/type_traits.hpp | 20 +++ include/plssvm/detail/utility.hpp | 15 +++ src/plssvm/backends/OpenCL/csvm.cpp | 52 ++++---- .../backends/OpenCL/detail/device_ptr.cpp | 124 +++++++++++++----- 7 files changed, 195 insertions(+), 63 deletions(-) diff --git a/include/plssvm/backends/OpenCL/csvm.hpp b/include/plssvm/backends/OpenCL/csvm.hpp index 11d57c424..460f8d54e 100644 --- a/include/plssvm/backends/OpenCL/csvm.hpp +++ b/include/plssvm/backends/OpenCL/csvm.hpp @@ -155,7 +155,7 @@ class csvm : public ::plssvm::detail::gpu_csvm // std::size_t +#include // std::variant namespace plssvm::opencl::detail { @@ -35,6 +36,7 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr get_variant(); + /** + * @brief Get a pointer to the device memory. + * @details If USM allocations are used, returns a `T*` otherwise returns a `cl_mem` object. + * @return a variant containing the device memory pointer (`[[nodiscard]]`) + */ + [[nodiscard]] std::variant get_variant() const; + /** * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type) */ @@ -123,6 +141,10 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr; diff --git a/include/plssvm/backends/OpenCL/detail/utility.hpp b/include/plssvm/backends/OpenCL/detail/utility.hpp index 5e58435f3..780ce6ba1 100644 --- a/include/plssvm/backends/OpenCL/detail/utility.hpp +++ b/include/plssvm/backends/OpenCL/detail/utility.hpp @@ -20,10 +20,12 @@ #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::compute_kernel_name #include "plssvm/backends/OpenCL/exceptions.hpp" // plssvm::opencl::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/type_list.hpp" // plssvm::detail::{remove_cvref_t, is_variant_v} +#include "plssvm/detail/utility.hpp" // plssvm::detail::visit_overload #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform -#include "CL/cl.h" // cl_uint, cl_int, clSetKernelArg, clEnqueueNDRangeKernel, clFinish +#include "CL/cl.h" // cl_uint, cl_int, clSetKernelArg, clSetKernelArgSVMPointer, clEnqueueNDRangeKernel, clFinish #include "fmt/format.h" // fmt::format @@ -31,6 +33,7 @@ #include // std::string #include // std::string_view #include // std::forward, std::pair +#include // std::variant, std::visit #include // std::vector /** @@ -141,7 +144,17 @@ inline void set_kernel_args(cl_kernel kernel, Args... 
args) { cl_uint i = 0; // iterate over parameter pack and set OpenCL kernel ([&](auto &arg) { - const error_code ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + error_code ec{}; + // check if we have to set a variant value + if constexpr (::plssvm::detail::is_variant_v<::plssvm::detail::remove_cvref_t>) { + std::visit(::plssvm::detail::visit_overload{ + [&](cl_mem &kernel_arg) { ec = clSetKernelArg(kernel, i++, sizeof(decltype(kernel_arg)), &kernel_arg); }, + [&](auto &kernel_arg) { ec = clSetKernelArgSVMPointer(kernel, i++, kernel_arg); } }, + arg); + } else { + // set kernel argument normally + ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + } PLSSVM_OPENCL_ERROR_CHECK(ec, fmt::format("error setting OpenCL kernel argument {}", i - 1)) }(args), ...); diff --git a/include/plssvm/detail/type_traits.hpp b/include/plssvm/detail/type_traits.hpp index 0ad95542d..effa4f556 100644 --- a/include/plssvm/detail/type_traits.hpp +++ b/include/plssvm/detail/type_traits.hpp @@ -24,6 +24,7 @@ #include // std::enable_if_t, std::remove_cv_t, std::remove_reference_t, std::false_type, std::true_type #include // std::unordered_map, std::unordered_multimap #include // std::unordered_set, std::unordered_multiset +#include // std::variant #include // std::vector namespace plssvm::detail { @@ -342,6 +343,25 @@ constexpr bool is_unordered_associative_container_v = is_unordered_set_v || i template constexpr bool is_container_v = is_sequence_container_v || is_associative_container_v || is_unordered_associative_container_v; +/** + * @brief Type trait to check whether @p T is a `std::variant`. + * @tparam T the type to check + */ +template +struct is_variant : std::false_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +struct is_variant> : std::true_type { }; + +/** + * @copybrief plssvm::detail::is_variant + */ +template +constexpr bool is_variant_v = is_variant::value; + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_TYPE_TRAITS_HPP_ diff --git a/include/plssvm/detail/utility.hpp b/include/plssvm/detail/utility.hpp index e81d46ae1..613a571cc 100644 --- a/include/plssvm/detail/utility.hpp +++ b/include/plssvm/detail/utility.hpp @@ -50,6 +50,21 @@ namespace plssvm::detail { +/** + * @brief Shorthand for a more readable `std::visit` overload set. + * @tparam Ts the visited types + */ +template +struct visit_overload : Ts... { + using Ts::operator()...; +}; + +/** + * @brief plssvm::detail::visit_overload + */ +template +visit_overload(Ts...) -> visit_overload; + /** * @brief Invokes undefined behavior. Used to mark code paths that may never be reachable. 
* @details See: C++23 [`std::unreachable`](https://en.cppreference.com/w/cpp/utility/unreachable) diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 562a63893..8fa57874e 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -262,7 +262,7 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size([[maybe_unused]] const std::s // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const bool use_usm_allocations, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { const cl_ulong num_rows_reduced = data_d.shape().x - 1; const cl_ulong num_features = data_d.shape().y; const queue_type &device = devices_[device_id]; @@ -277,7 +277,9 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons const ::plssvm::detail::triangular_data_distribution &dist = dynamic_cast<::plssvm::detail::triangular_data_distribution &>(*data_distribution_); const std::size_t num_entries_padded = dist.calculate_explicit_kernel_matrix_num_entries_padded(device_id); - device_ptr_type kernel_matrix_d{ num_entries_padded, device }; // only explicitly store the upper triangular matrix + // if solver == solver_type::cg_explicit: store it explicitly + // if solver == solver_type::cg_streaming: store it using USM + device_ptr_type kernel_matrix_d{ num_entries_padded, device, use_usm_allocations }; const real_type cost_factor = real_type{ 1.0 } / params.cost; // convert execution range block to OpenCL's native std::vector @@ -295,22 +297,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, 
grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -342,7 +344,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), 
grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); } // convert execution range block to OpenCL's native std::vector @@ -359,7 +361,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); } } detail::device_synchronize(device); @@ -382,7 +384,7 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), rhs_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get_variant(), rhs_d.get_variant(), grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -404,7 +406,7 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), scale, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get_variant(), scale, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -437,22 +439,22 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, 
device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), 
C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -488,7 +490,7 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get_variant(), alpha_d.get_variant(), sv_d.get_variant(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); @@ -519,22 +521,22 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get_variant(), sv_or_w_d.get_variant(), rho_d.get_variant(), predict_points_d.get_variant(), num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case 
kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 6aa67802a..c9439ae9e 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -21,30 +21,36 @@ #include "fmt/format.h" // fmt::format -#include // std::min +#include // std::min, std::fill #include // std::array #include // std::size_t +#include // std::memcpy #include // std::terminate #include // std::cerr, std::endl +#include // std::variant #include // std::vector namespace plssvm::opencl::detail { template -device_ptr::device_ptr(const size_type size, const command_queue &queue) : - device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, queue } { } +device_ptr::device_ptr(const size_type size, const command_queue &queue, const bool use_usm_allocations) : + device_ptr{ plssvm::shape{ size, 1 }, plssvm::shape{ 0, 0 }, queue, use_usm_allocations } { } template -device_ptr::device_ptr(const plssvm::shape shape, const command_queue &queue) : - device_ptr{ shape, plssvm::shape{ 0, 0 }, queue } { } +device_ptr::device_ptr(const plssvm::shape shape, const command_queue &queue, const bool use_usm_allocations) : + device_ptr{ shape, plssvm::shape{ 0, 0 }, queue, use_usm_allocations } { } 
template -device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const command_queue &queue) : - base_type{ shape, padding, &queue } { +device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const command_queue &queue, const bool use_usm_allocations) : + base_type{ shape, padding, &queue, use_usm_allocations } { error_code err{}; cl_context cont{}; PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") - data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); + if (use_usm_allocations_) { + usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + } else { + data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); + } PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the buffer") this->memset(0); } @@ -56,12 +62,35 @@ device_ptr::~device_ptr() { if (data_ != nullptr) { PLSSVM_OPENCL_ERROR_CHECK(clReleaseMemObject(data_), "error releasing the buffer") } + if (use_usm_allocations_ && usm_ptr_ != nullptr) { + cl_context cont{}; + PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") + clSVMFree(cont, usm_ptr_); + } } catch (const plssvm::exception &e) { std::cout << e.what_with_loc() << std::endl; std::terminate(); } } +template +auto device_ptr::get_variant() -> std::variant { + if (use_usm_allocations_) { + return { usm_ptr_ }; + } else { + return { this->get() }; + } +} + +template +auto device_ptr::get_variant() const -> std::variant { + if (use_usm_allocations_) { + return { usm_ptr_ }; + } else { + return { this->get() }; + } +} + template void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); @@ -70,10 +99,14 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); - error_code err; - const auto correct_value = static_cast(pattern); - err = clEnqueueFillBuffer(queue_->queue, data_, &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") + if (use_usm_allocations_) { + std::memset(usm_ptr_ + pos, pattern, rnum_bytes); + } else { + error_code err; + const auto correct_value = static_cast(pattern); + err = clEnqueueFillBuffer(queue_->queue, data_, &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") + } device_synchronize(*queue_); } @@ -87,9 +120,13 @@ void device_ptr::fill(const value_type value, const size_type pos, const size // run GPU kernel const size_type rcount = std::min(count, this->size_padded() - pos); - error_code err; - err = clEnqueueFillBuffer(queue_->queue, data_, &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") + if (use_usm_allocations_) { + std::fill(usm_ptr_ + pos, usm_ptr_ + pos + rcount, value); + } else { + error_code err; + err = clEnqueueFillBuffer(queue_->queue, data_, &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") + } device_synchronize(*queue_); } @@ -99,9 +136,13 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); - error_code err; - err = clEnqueueWriteBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") + if (use_usm_allocations_) { + std::memcpy(usm_ptr_ + pos, data_to_copy, rcount); + } else { + error_code err; + err = clEnqueueWriteBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") + } device_synchronize(*queue_); } @@ -114,17 +155,32 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, throw backend_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; } - const std::array buffer_origin{ 0, 0, 0 }; - const std::array host_origin{ 0, 0, 0 }; - const std::array region{ width * sizeof(value_type), height, 1 }; - const std::size_t buffer_row_pitch = this->shape_padded().x * sizeof(value_type); - const std::size_t buffer_slice_pitch = 0; - const std::size_t host_row_pitch = spitch * sizeof(value_type); - const std::size_t host_slice_pitch = 0; - - error_code err; - err = clEnqueueWriteBufferRect(queue_->queue, data_, CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, 
host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the strided data to the device buffer") + if (use_usm_allocations_) { + if (spitch == width) { + // can use normal copy since we have no line strides + this->copy_to_device(data_to_copy, 0, width * height); + } else { + std::vector temp(this->shape_padded().x * height, value_type{ 0.0 }); + value_type *pos = temp.data(); + for (std::size_t row = 0; row < height; ++row) { + std::memcpy(pos, data_to_copy + row * spitch, width * sizeof(value_type)); + pos += this->shape_padded().x; + } + this->copy_to_device(temp); + } + } else { + const std::array buffer_origin{ 0, 0, 0 }; + const std::array host_origin{ 0, 0, 0 }; + const std::array region{ width * sizeof(value_type), height, 1 }; + const std::size_t buffer_row_pitch = this->shape_padded().x * sizeof(value_type); + const std::size_t buffer_slice_pitch = 0; + const std::size_t host_row_pitch = spitch * sizeof(value_type); + const std::size_t host_slice_pitch = 0; + + error_code err; + err = clEnqueueWriteBufferRect(queue_->queue, data_, CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the strided data to the device buffer") + } device_synchronize(*queue_); } @@ -134,9 +190,13 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); - error_code err; - err = clEnqueueReadBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") + if (use_usm_allocations_) { + std::memcpy(buffer, usm_ptr_ + pos, rcount); + } else { + error_code err; + err = clEnqueueReadBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") + } device_synchronize(*queue_); } From b5894e01422611fba501d48065697315affc54a7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 16:42:17 +0200 Subject: [PATCH 14/93] Only call get_variant() where necessary. 
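Because a regular OpenCL buffer is a cl_mem handle while an SVM allocation is a raw pointer, the patch above routes kernel arguments through a std::variant and an overload set: clSetKernelArg handles the cl_mem case, clSetKernelArgSVMPointer the SVM case. The standalone sketch below distills that dispatch; overload, device_buffer, and set_arg are illustrative names rather than the PLSSVM API, and the SVM path requires an OpenCL 2.0 capable platform.

#define CL_TARGET_OPENCL_VERSION 300

#include <CL/cl.h>

#include <variant>

// minimal overload set for std::visit (C++17)
template <typename... Ts>
struct overload : Ts... {
    using Ts::operator()...;
};
template <typename... Ts>
overload(Ts...) -> overload<Ts...>;

// a device buffer is either a cl_mem object or a raw SVM pointer
using device_buffer = std::variant<cl_mem, float *>;

cl_int set_arg(cl_kernel kernel, const cl_uint index, const device_buffer &buf) {
    return std::visit(overload{
                          [&](cl_mem mem) { return clSetKernelArg(kernel, index, sizeof(cl_mem), &mem); },
                          [&](float *svm_ptr) { return clSetKernelArgSVMPointer(kernel, index, svm_ptr); } },
                      buf);
}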
--- src/plssvm/backends/OpenCL/csvm.cpp | 46 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 8fa57874e..359bc0268 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -297,22 +297,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; 
case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get_variant(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -344,7 +344,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } // convert execution range block to OpenCL's native std::vector @@ -361,7 +361,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get_variant(), beta, C_d.get_variant(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } } detail::device_synchronize(device); @@ -384,7 +384,7 @@ void csvm::run_inplace_matrix_addition(const std::size_t device_id, const ::plss const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, 
lhs_d.get_variant(), rhs_d.get_variant(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_add_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), rhs_d.get(), grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -406,7 +406,7 @@ void csvm::run_inplace_matrix_scale(const std::size_t device_id, const ::plssvm: const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get_variant(), scale, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::inplace_matrix_scale_kernel), native_partial_grid, native_block, num_rhs, lhs_d.get(), scale, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); } @@ -439,22 +439,22 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - 
detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get_variant(), A_d.get_variant(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get_variant(), C_d.get_variant(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_implicit_blas), native_partial_grid, native_block, alpha, q_red.get(), A_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, QA_cost, cost_factor, B_d.get(), C_d.get(), num_classes, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -490,7 +490,7 @@ auto csvm::run_w_kernel(const std::size_t device_id, const ::plssvm::detail::exe const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get_variant(), alpha_d.get_variant(), sv_d.get_variant(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::w_kernel), native_partial_grid, native_block, w_d.get(), alpha_d.get(), sv_d.get(), num_classes, num_sv, device_specific_num_sv, sv_offset, grid_offset_x, grid_offset_y); } detail::device_synchronize(device); @@ -521,22 +521,22 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get_variant(), sv_or_w_d.get_variant(), rho_d.get_variant(), predict_points_d.get_variant(), 
num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_linear), native_partial_grid, native_block, out_d.get(), sv_or_w_d.get(), rho_d.get(), predict_points_d.get(), num_classes, num_predict_points, num_features, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_polynomial), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_rbf), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_sigmoid), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_laplacian), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), 
native_partial_grid, native_block, out_d.get_variant(), alpha_d.get_variant(), rho_d.get_variant(), sv_or_w_d.get_variant(), predict_points_d.get_variant(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::predict_kernel_chi_squared), native_partial_grid, native_block, out_d.get(), alpha_d.get(), rho_d.get(), sv_or_w_d.get(), predict_points_d.get(), num_classes, num_sv, num_predict_points, num_features, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } From ed9b633eec394dd6850c145b31ab31dbf7e2976e Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 7 Oct 2024 17:13:41 +0200 Subject: [PATCH 15/93] Add and improve error check. --- src/plssvm/backends/OpenCL/detail/device_ptr.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index c9439ae9e..88114d6e1 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -43,15 +43,16 @@ device_ptr::device_ptr(const plssvm::shape shape, const command_queue &queue, template device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding, const command_queue &queue, const bool use_usm_allocations) : base_type{ shape, padding, &queue, use_usm_allocations } { - error_code err{}; cl_context cont{}; PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") if (use_usm_allocations_) { usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + PLSSVM_ASSERT(usm_ptr_ != nullptr, "error creating OpenCL SVM allocation"); } else { + error_code err{}; data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the buffer") } - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the buffer") this->memset(0); } From d8502751aec9abd091bb63a690fac724e2d91671 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 8 Oct 2024 17:50:14 +0200 Subject: [PATCH 16/93] Use cg_explicit as maximum allocation size constraint. 
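Since cg_streaming still materializes the full kernel matrix (only in unified shared memory that migrates between host and device), its largest single allocation matches cg_explicit rather than cg_implicit, so the streaming constraint now forwards to the explicit variant. Below is a minimal, purely illustrative sketch (hypothetical helper and parameter names, plain byte counts instead of the library's memory size type) of the kind of per-device comparison such allocation sizes feed into:

    #include <cstddef>
    #include <vector>

    // Collect the indices of all devices whose required single allocation
    // exceeds the maximum allocation size reported for that device.
    std::vector<std::size_t> find_failed_devices(const std::vector<std::size_t> &required_bytes_per_device,
                                                 const std::vector<std::size_t> &max_alloc_bytes_per_device) {
        std::vector<std::size_t> failed{};
        for (std::size_t device = 0; device < required_bytes_per_device.size(); ++device) {
            if (required_bytes_per_device[device] > max_alloc_bytes_per_device[device]) {
                failed.push_back(device);  // this device cannot hold its kernel matrix part in a single allocation
            }
        }
        return failed;
    }

Devices flagged by such a check are the ones that trigger the fallback from cg_explicit to cg_streaming, and from cg_streaming to cg_implicit, during solver selection.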
--- src/plssvm/detail/data_distribution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plssvm/detail/data_distribution.cpp b/src/plssvm/detail/data_distribution.cpp index db326fa59..016260389 100644 --- a/src/plssvm/detail/data_distribution.cpp +++ b/src/plssvm/detail/data_distribution.cpp @@ -213,7 +213,7 @@ std::pair> triangular_data_distribution::c } std::vector triangular_data_distribution::calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { - return this->calculate_maximum_implicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_classes); + return this->calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_classes); } std::vector triangular_data_distribution::calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(const std::size_t num_features, const std::size_t num_classes) const { From ed2e2a86acb48d0d53ff60044092934baa4114d4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 8 Oct 2024 17:58:58 +0200 Subject: [PATCH 17/93] Improve output by mentioning the maximum guaranteed allocation size. --- include/plssvm/csvm.hpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 098111397..d9999c49a 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -879,6 +879,15 @@ std::tuple, std::vector, std::vector max_mem_alloc_size_per_device = this->get_max_mem_alloc_size(); + // utility function returning a vector of memory sizes that where the reasons for a failed check + const auto get_failed_memory_sizes = [&max_mem_alloc_size_per_device](const std::vector &failed_devices) { + std::vector failed_memory_sizes{}; + for (const std::size_t device : failed_devices) { + failed_memory_sizes.push_back(max_mem_alloc_size_per_device[device]); + } + return failed_memory_sizes; + }; + // get the maximum single allocation size per device const std::vector max_single_allocation_cg_explicit_size_per_device = data_distribution.calculate_maximum_explicit_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); const std::vector max_single_allocation_cg_streaming_size_per_device = data_distribution.calculate_maximum_streaming_kernel_matrix_memory_allocation_size_per_place(num_features, num_rhs); @@ -903,9 +912,11 @@ std::tuple, std::vector, std::vector failed_cg_explicit_constraints = check_sizes(max_single_allocation_cg_explicit_size_per_device, max_mem_alloc_size_per_device); used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) { + // max mem alloc size constraints not fulfilled detail::log(verbosity_level::full, - "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_streaming.\n", + "Cannot use cg_explicit due to maximum single memory allocation constraints ({}) on device(s) {}! Falling back to cg_streaming.\n", + format_vector(get_failed_memory_sizes(failed_cg_explicit_constraints)), format_vector(failed_cg_explicit_constraints)); // can't use cg_explicit used_solver = solver_type::cg_streaming; @@ -914,7 +925,8 @@ std::tuple, std::vector, std::vector, std::vector, std::vector Date: Tue, 8 Oct 2024 18:02:51 +0200 Subject: [PATCH 18/93] Throw an exception if clSVMAlloc failed. 
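Unlike clCreateBuffer, clSVMAlloc does not report failures through an error code; it simply returns a null pointer, and the PLSSVM_ASSERT alone is typically compiled out of release builds. The allocation result is therefore checked explicitly and turned into a backend exception that states the requested size. A minimal sketch of this pattern against the plain OpenCL C API follows (standalone example with a hypothetical function name, not the patched device_ptr code; clSVMAlloc, clSVMFree, and the mentioned constants are the regular OpenCL 2.0 API):

    #include <CL/cl.h>

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Allocate a shared virtual memory buffer and translate the null-pointer
    // failure convention of clSVMAlloc into an exception.
    float *allocate_svm_buffer(const cl_context context, const std::size_t num_elements) {
        const std::size_t num_bytes = num_elements * sizeof(float);
        auto *ptr = static_cast<float *>(clSVMAlloc(context, CL_MEM_READ_WRITE, num_bytes, 0));
        if (ptr == nullptr) {
            // possible reasons: the size exceeds CL_DEVICE_MAX_MEM_ALLOC_SIZE or device memory is exhausted
            throw std::runtime_error{ "clSVMAlloc failed to allocate " + std::to_string(num_bytes) + " bytes" };
        }
        return ptr;  // must later be released via clSVMFree(context, ptr)
    }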
--- src/plssvm/backends/OpenCL/detail/device_ptr.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 88114d6e1..086884dc6 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -14,6 +14,7 @@ #include "plssvm/backends/OpenCL/detail/utility.hpp" // PLSSVM_OPENCL_ERROR_CHECK #include "plssvm/backends/OpenCL/exceptions.hpp" // plssvm::opencl::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/exceptions/exceptions.hpp" // plssvm::exception #include "plssvm/shape.hpp" // plssvm::shape @@ -47,6 +48,9 @@ device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") if (use_usm_allocations_) { usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + if (usm_ptr_ == nullptr) { + throw backend_exception{ fmt::format("Failed to allocate {} of memory using clSVMAlloc(...). Maybe that's larger than CL_DEVICE_MAX_MEM_ALLOC_SIZE?", ::plssvm::detail::memory_size{ this->size_padded() * sizeof(value_type) }) }; + } PLSSVM_ASSERT(usm_ptr_ != nullptr, "error creating OpenCL SVM allocation"); } else { error_code err{}; From 9fcdd7fab2915072d78b45e8f8b97f9ba9c6f0f9 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 8 Oct 2024 18:03:14 +0200 Subject: [PATCH 19/93] Rewrite OpenCL context logic to also support cg_streaming with multiple GPUs. --- .../plssvm/backends/OpenCL/detail/context.hpp | 11 +- src/plssvm/backends/OpenCL/csvm.cpp | 9 +- src/plssvm/backends/OpenCL/detail/context.cpp | 11 +- src/plssvm/backends/OpenCL/detail/utility.cpp | 195 ++++++++++-------- 4 files changed, 125 insertions(+), 101 deletions(-) diff --git a/include/plssvm/backends/OpenCL/detail/context.hpp b/include/plssvm/backends/OpenCL/detail/context.hpp index 2b26e6f93..c49f235dd 100644 --- a/include/plssvm/backends/OpenCL/detail/context.hpp +++ b/include/plssvm/backends/OpenCL/detail/context.hpp @@ -15,13 +15,12 @@ #include "CL/cl.h" // cl_context, cl_platform_id, cl_device_id -#include // std::vector - namespace plssvm::opencl::detail { /** * @brief RAII wrapper class around a cl_context. - * @details Also contains the associated platform and a list of all associated devices. + * @details Also contains the associated platform and device. + * @note Each context is guaranteed to only contain a single device, i.e., on multi-device system, one context for each device is created. */ class context { public: @@ -35,7 +34,7 @@ class context { * @param[in] platform the OpenCL platform associated with this OpenCL context * @param[in] devices the list of devices associated with this OpenCL cl_context */ - context(cl_context device_context, cl_platform_id platform, std::vector devices); + context(cl_context device_context, cl_platform_id platform, cl_device_id device); /** * @brief Delete copy-constructor to make context a move only type. @@ -78,8 +77,8 @@ class context { cl_context device_context{}; /// The OpenCL platform associated with this context. cl_platform_id platform{}; - /// All devices associated with this context. - std::vector devices{}; + /// The device associated with this context. 
+ cl_device_id device{}; }; } // namespace plssvm::opencl::detail diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index 359bc0268..cb032be85 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -106,16 +106,9 @@ void csvm::init(const target_platform target) { // get all available OpenCL contexts for the current target including devices with respect to the requested target platform std::tie(contexts_, target_) = detail::get_contexts(target); - // currently, only EXACTLY one OpenCL context is allowed + // at least one context must be created if (contexts_.empty()) { throw backend_exception{ fmt::format("No OpenCL context for the target {} could be found!", target_) }; - } else if (contexts_.size() > 1) { - throw backend_exception{ fmt::format("Currently only a single OpenCL context is allowed, but {} were found for the target {}!", contexts_.size(), target_) }; - } - - // throw exception if no devices for the requested target could be found - if (contexts_[0].devices.empty()) { - throw backend_exception{ fmt::format("OpenCL backend selected but no devices for the target {} were found!", target) }; } // print OpenCL info diff --git a/src/plssvm/backends/OpenCL/detail/context.cpp b/src/plssvm/backends/OpenCL/detail/context.cpp index e534e079d..e2184a0f4 100644 --- a/src/plssvm/backends/OpenCL/detail/context.cpp +++ b/src/plssvm/backends/OpenCL/detail/context.cpp @@ -12,25 +12,24 @@ #include // std::addressof #include // std::exchange, std::move -#include // std::vector namespace plssvm::opencl::detail { -context::context(cl_context p_device_context, cl_platform_id p_platform, std::vector p_devices) : +context::context(cl_context p_device_context, cl_platform_id p_platform, cl_device_id p_device) : device_context{ p_device_context }, platform{ p_platform }, - devices{ std::move(p_devices) } { } + device{ p_device } { } context::context(context &&other) noexcept : device_context{ std::exchange(other.device_context, nullptr) }, platform{ std::exchange(other.platform, nullptr) }, - devices{ std::move(other.devices) } { } + device{ other.device } { } -context &context::operator=(context &&other)noexcept { +context &context::operator=(context &&other) noexcept { if (this != std::addressof(other)) { other.device_context = std::exchange(other.device_context, nullptr); platform = std::exchange(other.platform, nullptr); - devices = std::move(other.devices); + device = std::move(other.device); } return *this; } diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 9a62c77cf..41354b4e4 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -139,10 +139,12 @@ namespace plssvm::opencl::detail { for (auto &[platform, devices] : platform_devices) { // create context and associated OpenCL platform with it std::array context_properties = { CL_CONTEXT_PLATFORM, reinterpret_cast(platform.first), 0 }; - cl_context cont = clCreateContext(context_properties.data(), static_cast(devices.size()), devices.data(), nullptr, nullptr, &err); - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the OpenCL context") - // add OpenCL context to vector of context wrappers - contexts.emplace_back(cont, platform.first, std::move(devices)); + for (auto &device : devices) { + cl_context cont = clCreateContext(context_properties.data(), cl_uint{ 1 }, &device, nullptr, nullptr, &err); + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating the OpenCL 
context") + // add OpenCL context to vector of context wrappers + contexts.emplace_back(cont, platform.first, device); + } } return std::make_pair(std::move(contexts), target); @@ -208,12 +210,7 @@ std::vector> kernel_type_to_function } std::vector create_command_queues(const std::vector &contexts, const kernel_function_type kernel_function, const std::vector> &kernel_names) { - std::vector queues; - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { - queues.emplace_back(contexts[0], contexts[0].devices[device]); - } - PLSSVM_ASSERT(!queues.empty(), "At least one command queue must be available!"); - + // a small helper function for better error messages const auto cl_build_program_error_message = [](cl_program prog, cl_device_id device, const std::size_t device_idx) { // determine the size of the log std::size_t log_size{}; @@ -228,28 +225,41 @@ std::vector create_command_queues(const std::vector &con } }; - // determine OpenCL compile options - std::string compile_options{ "-cl-mad-enable -cl-no-signed-zeros" }; + //**************************************************************************// + // determine per device compile options // + //**************************************************************************// + + // determine OpenCL compile options per device + std::string global_compile_options{ "-cl-mad-enable -cl-no-signed-zeros" }; #if defined(PLSSVM_ENABLE_FAST_MATH) - compile_options += " -cl-fast-relaxed-math"; + global_compile_options += " -cl-fast-relaxed-math"; #endif + std::vector compile_options(contexts.size(), global_compile_options); + // only use PTX inline assembly if enabled during CMake configuration #if defined(PLSSVM_OPENCL_BACKEND_USE_PTX_INLINE_ASSEMBLY) - std::size_t platform_vendor_size{ 0 }; - clGetPlatformInfo(contexts[0].platform, CL_PLATFORM_VENDOR, 0, nullptr, &platform_vendor_size); - std::string platform_vendor(platform_vendor_size, '\0'); - clGetPlatformInfo(contexts[0].platform, CL_PLATFORM_VENDOR, platform_vendor_size, platform_vendor.data(), nullptr); - const bool use_inline_assembly = ::plssvm::detail::contains(::plssvm::detail::as_lower_case(platform_vendor), "nvidia"); - if (use_inline_assembly) { - compile_options += " -DPLSSVM_USE_NVIDIA_PTX_INLINE_ASSEMBLY"; - plssvm::detail::log(verbosity_level::full, - "Enabling atomicAdd acceleration using PTX inline assembly.\n"); + for (std::size_t idx = 0; idx < contexts.size(); ++idx) { + auto &context = contexts[idx]; + + std::size_t platform_vendor_size{ 0 }; + clGetPlatformInfo(context.platform, CL_PLATFORM_VENDOR, 0, nullptr, &platform_vendor_size); + std::string platform_vendor(platform_vendor_size, '\0'); + clGetPlatformInfo(context.platform, CL_PLATFORM_VENDOR, platform_vendor_size, platform_vendor.data(), nullptr); + const bool use_inline_assembly = ::plssvm::detail::contains(::plssvm::detail::as_lower_case(platform_vendor), "nvidia"); + if (use_inline_assembly) { + compile_options[idx] += " -DPLSSVM_USE_NVIDIA_PTX_INLINE_ASSEMBLY"; + plssvm::detail::log(verbosity_level::full, + "Enabling atomicAdd acceleration using PTX inline assembly for device {}.\n", + idx); + } + PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "opencl", fmt::format("use_inline_assembly_device_{}", idx), use_inline_assembly })); } - PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "opencl", "use_inline_assembly", use_inline_assembly })); #endif - error_code err, err_bin; 
+ //**************************************************************************// + // assemble the OpenCL kernel // + //**************************************************************************// // note: unsigned long long may NOT be used in an OpenCL kernel (use ulong instead) // note: real_type temp{ 0.0 } may NOT be used in an OpenCL kernel (use real_type temp = 0.0 instead) @@ -347,23 +357,25 @@ std::vector create_command_queues(const std::vector &con ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE", fmt::format("{}", PADDING_SIZE)); // get all device names - std::vector device_names(contexts[0].devices.size()); - for (typename std::vector::size_type device_id = 0; device_id < device_names.size(); ++device_id) { + std::vector device_names{}; + for (auto &context : contexts) { // get device name std::size_t name_length{}; - PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(contexts[0].devices[device_id], CL_DEVICE_NAME, 0, nullptr, &name_length), "error obtaining device name size") + PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(context.device, CL_DEVICE_NAME, 0, nullptr, &name_length), "error obtaining device name size") std::string device_name(name_length - 1, '\0'); - PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(contexts[0].devices[device_id], CL_DEVICE_NAME, name_length, device_name.data(), nullptr), "error obtaining device name") - device_names[device_id] = std::move(device_name); + PLSSVM_OPENCL_ERROR_CHECK(clGetDeviceInfo(context.device, CL_DEVICE_NAME, name_length, device_name.data(), nullptr), "error obtaining device name") + device_names.push_back(device_name); } - // append number of device to influence checksum calculation + // append other information to make the kernel string unique kernel_src_string.append(fmt::format("\n\n" - "// devices: [{}]\n" + "// num_devices: {}\n" + "// device_names: [{}]\n" "// OpenCL library: \"{}\"\n" "// OpenCL target version: {}\n" "// CMAKE_BUILD_TYPE: {}\n" "// compile_options: \"{}\"\n", + contexts.size(), fmt::join(device_names, ", "), PLSSVM_OPENCL_LIBRARY, CL_TARGET_OPENCL_VERSION, @@ -376,14 +388,13 @@ std::vector create_command_queues(const std::vector &con // convert string to const char* const char *kernel_src_ptr = kernel_src_string.c_str(); - // data to build the final OpenCL program - std::vector binary_sizes(contexts[0].devices.size()); - std::vector> binaries(contexts[0].devices.size()); - std::vector binaries_ptr(binaries.size()); - // create caching folder in the temporary directory and change the permissions such that everybody has read/write access const std::filesystem::path cache_dir_name = std::filesystem::temp_directory_path() / "plssvm_opencl_cache" / checksum; + //**************************************************************************// + // check whether a cached OpenCL kernel can be used // + //**************************************************************************// + // potential reasons why OpenCL caching could fail enum class caching_status { success, @@ -414,41 +425,59 @@ std::vector create_command_queues(const std::vector &con // get directory iterator auto dirIter = std::filesystem::directory_iterator(cache_dir_name); // get files in directory -> account for stored preprocessed source file - if (static_cast(std::count_if(std::filesystem::begin(dirIter), std::filesystem::end(dirIter), [](const auto &entry) { return entry.is_regular_file(); })) != contexts[0].devices.size() + 1) { + if (static_cast(std::count_if(std::filesystem::begin(dirIter), std::filesystem::end(dirIter), [](const auto &entry) { 
return entry.is_regular_file(); })) != contexts.size() + 1) { use_cached_binaries = caching_status::error_invalid_number_of_cached_files; } } + //**************************************************************************// + // fill the OpenCL binaries (either compile or use cached values) // + //**************************************************************************// + + // data to build the final OpenCL program + std::vector binary_sizes(contexts.size()); + std::vector> binaries(contexts.size()); + std::vector binaries_ptr(binaries.size()); + + error_code err, err_bin; + if (use_cached_binaries != caching_status::success) { plssvm::detail::log(verbosity_level::full, "Building OpenCL kernels from source (reason: {}).\n", caching_status_to_string(use_cached_binaries)); - // create and build program - cl_program program = clCreateProgramWithSource(contexts[0], 1, &kernel_src_ptr, nullptr, &err); - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating program from source") + // build OpenCL kernels for each context, i.e., each device + for (std::size_t idx = 0; idx < contexts.size(); ++idx) { + auto &context = contexts[idx]; + auto &device = context.device; - err = clBuildProgram(program, static_cast(contexts[0].devices.size()), contexts[0].devices.data(), compile_options.c_str(), nullptr, nullptr); + // create and build program + cl_program program = clCreateProgramWithSource(context, 1, &kernel_src_ptr, nullptr, &err); + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating program from source") - if (!err) { - // check all devices for errors - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { - cl_build_program_error_message(program, contexts[0].devices[device], device); + err = clBuildProgram(program, 1, &device, compile_options[idx].c_str(), nullptr, nullptr); + + if (!err) { + // check device for errors + cl_build_program_error_message(program, device, idx); + PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") } - PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") - } - // get sizes of binaries - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, contexts[0].devices.size() * sizeof(std::size_t), binary_sizes.data(), nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel (binary) kernel sizes") - for (std::vector>::size_type i = 0; i < binaries.size(); ++i) { - binaries[i] = std::vector(binary_sizes[i]); - binaries_ptr[i] = binaries[i].data(); // only necessary for OpenCL's void ** calls! - } + // get sizes of binaries + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(std::size_t), &binary_sizes[idx], nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel (binary) kernel sizes") + binaries[idx] = std::vector(binary_sizes[idx]); + binaries_ptr[idx] = binaries[idx].data(); // only necessary for OpenCL's void ** calls! 
+ + // get binaries + err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &binaries_ptr[idx], nullptr); + PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel binaries") - // get binaries - err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, contexts[0].devices.size() * sizeof(unsigned char *), binaries_ptr.data(), nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error retrieving the kernel binaries") + // release resource + if (program) { + PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(program), "error releasing OpenCL program resources") + } + } // write binaries to file if (!std::filesystem::exists(cache_dir_name)) { @@ -474,11 +503,6 @@ std::vector create_command_queues(const std::vector &con plssvm::detail::log(verbosity_level::full, "Cached OpenCL kernel binaries in {}.\n", cache_dir_name); - - // release resource - if (program) { - PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(program), "error releasing OpenCL program resources") - } } else { plssvm::detail::log(verbosity_level::full, "Using cached OpenCL kernel binaries from {}.\n", @@ -515,32 +539,41 @@ std::vector create_command_queues(const std::vector &con } } - // build from binaries - cl_program binary_program = clCreateProgramWithBinary(contexts[0], static_cast(contexts[0].devices.size()), contexts[0].devices.data(), binary_sizes.data(), const_cast(binaries_ptr.data()), &err_bin, &err); - PLSSVM_OPENCL_ERROR_CHECK(err_bin, "error loading binaries") - PLSSVM_OPENCL_ERROR_CHECK(err, "error creating binary program") - err = clBuildProgram(binary_program, static_cast(contexts[0].devices.size()), contexts[0].devices.data(), nullptr, nullptr, nullptr); - if (!err) { - // check all devices for errors - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { - cl_build_program_error_message(binary_program, contexts[0].devices[device], device); + std::vector queues{}; + // compile kernels for each context, i.e., each device + for (std::size_t idx = 0; idx < contexts.size(); ++idx) { + auto &context = contexts[idx]; + auto &device = context.device; + + // build from binaries + cl_program binary_program = clCreateProgramWithBinary(context, cl_uint{ 1 }, &device, &binary_sizes[idx], const_cast(&binaries_ptr[idx]), &err_bin, &err); + PLSSVM_OPENCL_ERROR_CHECK(err_bin, "error loading binaries") + PLSSVM_OPENCL_ERROR_CHECK(err, "error creating binary program") + err = clBuildProgram(binary_program, cl_uint{ 1 }, &device, nullptr, nullptr, nullptr); + if (!err) { + // check device for errors + cl_build_program_error_message(binary_program, device, idx); + PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") } - PLSSVM_OPENCL_ERROR_CHECK(err, "error building program") - } - // build all kernels, one for each device - for (std::vector::size_type device = 0; device < contexts[0].devices.size(); ++device) { + // each context contains exactly one device + command_queue queue{ context, device }; + + // build all kernels, one for each device for (const std::pair &name : kernel_names) { // create kernel - queues[device].add_kernel(name.first, kernel{ clCreateKernel(binary_program, name.second.c_str(), &err) }); - PLSSVM_OPENCL_ERROR_CHECK(err, fmt::format("error creating OpenCL kernel {} for device {}", name.second, device)) + queue.add_kernel(name.first, kernel{ clCreateKernel(binary_program, name.second.c_str(), &err) }); + PLSSVM_OPENCL_ERROR_CHECK(err, fmt::format("error creating OpenCL kernel {} for device {}", name.second, idx)) } - } - // release resource - if (binary_program) { - 
PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(binary_program), "error releasing OpenCL binary program resources") + // release resource + if (binary_program) { + PLSSVM_OPENCL_ERROR_CHECK(clReleaseProgram(binary_program), "error releasing OpenCL binary program resources") + } + + queues.push_back(std::move(queue)); } + PLSSVM_ASSERT(!queues.empty(), "At least one command queue must be available!"); return queues; } From 1dd509ceae04235ac122cbe7735bf189c9fddd13 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 9 Oct 2024 15:10:50 +0200 Subject: [PATCH 20/93] Use the correct OpenCL functions to perform SVM pointer operations and improve simplicity of implementation by using a std::variant as device_pointer_type. --- .../backends/OpenCL/detail/device_ptr.hpp | 21 +--- include/plssvm/backends/gpu_device_ptr.hpp | 36 ++++--- src/plssvm/backends/OpenCL/csvm.cpp | 16 +-- .../backends/OpenCL/detail/device_ptr.cpp | 98 +++++++++---------- 4 files changed, 74 insertions(+), 97 deletions(-) diff --git a/include/plssvm/backends/OpenCL/detail/device_ptr.hpp b/include/plssvm/backends/OpenCL/detail/device_ptr.hpp index a88f4ce0f..ab7ee5f4e 100644 --- a/include/plssvm/backends/OpenCL/detail/device_ptr.hpp +++ b/include/plssvm/backends/OpenCL/detail/device_ptr.hpp @@ -29,9 +29,9 @@ namespace plssvm::opencl::detail { * @tparam T the type of the kernel pointer to wrap */ template -class device_ptr : public ::plssvm::detail::gpu_device_ptr> { +class device_ptr : public ::plssvm::detail::gpu_device_ptr, device_ptr> { /// The template base type of the OpenCL device_ptr class. - using base_type = ::plssvm::detail::gpu_device_ptr>; + using base_type = ::plssvm::detail::gpu_device_ptr, device_ptr>; using base_type::data_; using base_type::queue_; @@ -104,19 +104,6 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr get_variant(); - /** - * @brief Get a pointer to the device memory. - * @details If USM allocations are used, returns a `T*` otherwise returns a `cl_mem` object. - * @return a variant containing the device memory pointer (`[[nodiscard]]`) - */ - [[nodiscard]] std::variant get_variant() const; - /** * @copydoc plssvm::detail::gpu_device_ptr::memset(int, size_type, size_type) */ @@ -141,10 +128,6 @@ class device_ptr : public ::plssvm::detail::gpu_device_ptr; diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index c4a277e06..d6e5045f1 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -19,8 +19,12 @@ #include "plssvm/matrix.hpp" // plssvm::layout_type, plssvm::matrix #include "plssvm/shape.hpp" // plssvm::shape -#include // std::size_t -#include // std::vector +#include "fmt/format.h" // fmt::format + +#include // std::swap +#include // std::size_t +#include // std::exchange +#include // std::vector namespace plssvm::detail { @@ -231,7 +235,7 @@ class gpu_device_ptr { */ void fill(value_type value, size_type pos = 0); /** - * @brief Fill up-to @p count values to @p value starting at position @p pos. + * @brief Fill up-to @p count values of @p value starting at position @p pos. * @details Fill `[pos, rcount)` where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`. * @param[in] value the fill value * @param[in] pos the position to start the fill @@ -426,14 +430,14 @@ void gpu_device_ptr::swap( template void gpu_device_ptr::memset(const int pattern, const size_type pos) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! 
Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->memset(pattern, pos, this->size_padded() * sizeof(value_type)); } template void gpu_device_ptr::fill(const value_type value, const size_type pos) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->fill(value, pos, this->size_padded()); } @@ -441,7 +445,7 @@ void gpu_device_ptr::fill( template template void gpu_device_ptr::copy_to_device(const matrix &data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (data_to_copy.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Too few data to perform copy (needed: {}, provided: {})!", this->size_padded(), data_to_copy.size_padded()) }; @@ -451,14 +455,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_device(data_to_copy, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (data_to_copy.size() < rcount) { @@ -469,7 +473,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const_host_pointer_type data_to_copy) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_device(data_to_copy, 0, this->size_padded()); @@ -478,7 +482,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_device_strided(const matrix &data_to_copy, const std::size_t start_row, const std::size_t num_rows) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (start_row + num_rows > data_to_copy.num_rows()) { throw gpu_device_ptr_exception{ fmt::format("Tried to copy lines {}-{} (zero-based index) to the device, but the matrix has only {} lines!", start_row, start_row + num_rows - 1, data_to_copy.num_rows()) }; @@ -504,7 +508,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device_strided(const std::vector &data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (width > spitch) { throw gpu_device_ptr_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; @@ -519,7 +523,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_host(matrix &buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (buffer.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", this->size_padded(), buffer.size_padded()) }; @@ -529,14 +533,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(std::vector &buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_host(buffer, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_host(std::vector &buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (buffer.size() < rcount) { @@ -547,7 +551,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(host_pointer_type buffer) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_host(buffer, 0, this->size_padded()); @@ -555,7 +559,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &target) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! 
Maybe target has been default constructed?"); this->copy_to_other_device(target, 0, this->size_padded()); diff --git a/src/plssvm/backends/OpenCL/csvm.cpp b/src/plssvm/backends/OpenCL/csvm.cpp index cb032be85..2daddbcaa 100644 --- a/src/plssvm/backends/OpenCL/csvm.cpp +++ b/src/plssvm/backends/OpenCL/csvm.cpp @@ -290,22 +290,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons switch (params.kernel_type) { case kernel_function_type::linear: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y); break; case kernel_function_type::polynomial: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::run_kernel(device, 
device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get_variant(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::assemble_kernel_matrix_explicit), native_partial_grid, native_block, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, grid_offset_x, grid_offset_y, std::get(params.gamma)); break; } } @@ -337,7 +337,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::symm_kernel_explicit), native_partial_grid, native_block, num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } // convert execution range block to OpenCL's native std::vector @@ -354,7 +354,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : const cl_ulong grid_offset_x = offsets.x; const cl_ulong grid_offset_y = offsets.y; - detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get_variant(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); + detail::run_kernel(device, device.get_kernel(detail::compute_kernel_name::mirror_symm_kernel_explicit), native_partial_grid, native_mirror_block, num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, A_d.get(), B_d.get(), beta, C_d.get(), grid_offset_x, grid_offset_y); } } detail::device_synchronize(device); diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 086884dc6..6e417524b 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -22,10 +22,9 @@ #include "fmt/format.h" // fmt::format -#include // std::min, std::fill +#include // std::min #include // std::array #include // std::size_t -#include // std::memcpy #include // std::terminate #include // std::cerr, std::endl 
#include // std::variant @@ -47,11 +46,11 @@ device_ptr::device_ptr(const plssvm::shape shape, const plssvm::shape padding cl_context cont{}; PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") if (use_usm_allocations_) { - usm_ptr_ = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); - if (usm_ptr_ == nullptr) { + T* usm_ptr = static_cast(clSVMAlloc(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), 0)); + if (usm_ptr == nullptr) { throw backend_exception{ fmt::format("Failed to allocate {} of memory using clSVMAlloc(...). Maybe that's larger than CL_DEVICE_MAX_MEM_ALLOC_SIZE?", ::plssvm::detail::memory_size{ this->size_padded() * sizeof(value_type) }) }; } - PLSSVM_ASSERT(usm_ptr_ != nullptr, "error creating OpenCL SVM allocation"); + data_ = usm_ptr; } else { error_code err{}; data_ = clCreateBuffer(cont, CL_MEM_READ_WRITE, this->size_padded() * sizeof(value_type), nullptr, &err); @@ -64,13 +63,18 @@ template device_ptr::~device_ptr() { // avoid compiler warnings try { - if (data_ != nullptr) { - PLSSVM_OPENCL_ERROR_CHECK(clReleaseMemObject(data_), "error releasing the buffer") - } - if (use_usm_allocations_ && usm_ptr_ != nullptr) { - cl_context cont{}; - PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") - clSVMFree(cont, usm_ptr_); + if (use_usm_allocations_) { + T* usm_ptr = std::get(data_); + if (usm_ptr != nullptr) { + cl_context cont{}; + PLSSVM_OPENCL_ERROR_CHECK(clGetCommandQueueInfo(queue_->queue, CL_QUEUE_CONTEXT, sizeof(cl_context), static_cast(&cont), nullptr), "error retrieving the command queue context") + clSVMFree(cont, usm_ptr); + } + } else { + cl_mem mem = std::get(data_); + if (mem != nullptr) { + PLSSVM_OPENCL_ERROR_CHECK(clReleaseMemObject(mem), "error releasing the buffer") + } } } catch (const plssvm::exception &e) { std::cout << e.what_with_loc() << std::endl; @@ -78,46 +82,29 @@ device_ptr::~device_ptr() { } } -template -auto device_ptr::get_variant() -> std::variant { - if (use_usm_allocations_) { - return { usm_ptr_ }; - } else { - return { this->get() }; - } -} - -template -auto device_ptr::get_variant() const -> std::variant { - if (use_usm_allocations_) { - return { usm_ptr_ }; - } else { - return { this->get() }; - } -} - template void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); + + const auto correct_value = static_cast(pattern); + error_code err; if (use_usm_allocations_) { - std::memset(usm_ptr_ + pos, pattern, rnum_bytes); + err = clEnqueueSVMMemFill(queue_->queue, std::get(data_) + pos, &correct_value, sizeof(unsigned char), rnum_bytes, 0, nullptr, nullptr); } else { - error_code err; - const auto correct_value = static_cast(pattern); - err = clEnqueueFillBuffer(queue_->queue, data_, &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") + err = clEnqueueFillBuffer(queue_->queue, std::get(data_), &correct_value, sizeof(unsigned char), pos * sizeof(value_type), rnum_bytes, 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via memset") device_synchronize(*queue_); } template void device_ptr::fill(const value_type value, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) }; @@ -125,35 +112,37 @@ void device_ptr::fill(const value_type value, const size_type pos, const size // run GPU kernel const size_type rcount = std::min(count, this->size_padded() - pos); + + error_code err; if (use_usm_allocations_) { - std::fill(usm_ptr_ + pos, usm_ptr_ + pos + rcount, value); + err = clEnqueueSVMMemFill(queue_->queue, std::get(data_) + pos, &value, sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); } else { - error_code err; - err = clEnqueueFillBuffer(queue_->queue, data_, &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") + err = clEnqueueFillBuffer(queue_->queue, std::get(data_), &value, sizeof(value_type), pos * sizeof(value_type), rcount * sizeof(value_type), 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error filling the buffer via fill") device_synchronize(*queue_); } template void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); + + error_code err; if (use_usm_allocations_) { - std::memcpy(usm_ptr_ + pos, data_to_copy, rcount); + err = clEnqueueSVMMemcpy(queue_->queue, CL_TRUE, std::get(data_) + pos, data_to_copy, rcount * sizeof(value_type), 0, nullptr, nullptr); } else { - error_code err; - err = clEnqueueWriteBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") + err = clEnqueueWriteBuffer(queue_->queue, std::get(data_), CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), data_to_copy, 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data to the device buffer") device_synchronize(*queue_); } template void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); if (width > spitch) { @@ -183,7 +172,7 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t host_slice_pitch = 0; error_code err; - err = clEnqueueWriteBufferRect(queue_->queue, data_, CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); + err = clEnqueueWriteBufferRect(queue_->queue, std::get(data_), CL_TRUE, buffer_origin.data(), host_origin.data(), region.data(), buffer_row_pitch, buffer_slice_pitch, host_row_pitch, host_slice_pitch, data_to_copy, 0, nullptr, nullptr); PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the strided data to the device buffer") } device_synchronize(*queue_); @@ -191,23 +180,24 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, template void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); + + error_code err; if (use_usm_allocations_) { - std::memcpy(buffer, usm_ptr_ + pos, rcount); + err = clEnqueueSVMMemcpy(queue_->queue, CL_TRUE, buffer, std::get(data_) + pos, rcount * sizeof(value_type), 0, nullptr, nullptr); } else { - error_code err; - err = clEnqueueReadBuffer(queue_->queue, data_, CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); - PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") + err = clEnqueueReadBuffer(queue_->queue, std::get(data_), CL_TRUE, pos * sizeof(value_type), rcount * sizeof(value_type), buffer, 0, nullptr, nullptr); } + PLSSVM_OPENCL_ERROR_CHECK(err, "error copying the data from the device buffer") device_synchronize(*queue_); } template void device_ptr::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != nullptr, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); From 570ba776d2a80dbf35d0288b86cca824de1e4f50 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 9 Oct 2024 15:34:02 +0200 Subject: [PATCH 21/93] Fix usage of undefined type alias in assertion message. --- include/plssvm/backends/gpu_device_ptr.hpp | 28 +++++++++---------- .../backends/OpenCL/detail/device_ptr.cpp | 14 +++++----- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/plssvm/backends/gpu_device_ptr.hpp b/include/plssvm/backends/gpu_device_ptr.hpp index d6e5045f1..64888a86e 100644 --- a/include/plssvm/backends/gpu_device_ptr.hpp +++ b/include/plssvm/backends/gpu_device_ptr.hpp @@ -430,14 +430,14 @@ void gpu_device_ptr::swap( template void gpu_device_ptr::memset(const int pattern, const size_type pos) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->memset(pattern, pos, this->size_padded() * sizeof(value_type)); } template void gpu_device_ptr::fill(const value_type value, const size_type pos) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->fill(value, pos, this->size_padded()); } @@ -445,7 +445,7 @@ void gpu_device_ptr::fill( template template void gpu_device_ptr::copy_to_device(const matrix &data_to_copy) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (data_to_copy.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Too few data to perform copy (needed: {}, provided: {})!", this->size_padded(), data_to_copy.size_padded()) }; @@ -455,14 +455,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_device(data_to_copy, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_device(const std::vector &data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (data_to_copy.size() < rcount) { @@ -473,7 +473,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device(const_host_pointer_type data_to_copy) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_device(data_to_copy, 0, this->size_padded()); @@ -482,7 +482,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_device_strided(const matrix &data_to_copy, const std::size_t start_row, const std::size_t num_rows) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (start_row + num_rows > data_to_copy.num_rows()) { throw gpu_device_ptr_exception{ fmt::format("Tried to copy lines {}-{} (zero-based index) to the device, but the matrix has only {} lines!", start_row, start_row + num_rows - 1, data_to_copy.num_rows()) }; @@ -508,7 +508,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_device_strided(const std::vector &data_to_copy, std::size_t spitch, std::size_t width, std::size_t height) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (width > spitch) { throw gpu_device_ptr_exception{ fmt::format("Invalid width and spitch combination specified (width: {} <= spitch: {})!", width, spitch) }; @@ -523,7 +523,7 @@ void gpu_device_ptr::copy_ template template void gpu_device_ptr::copy_to_host(matrix &buffer) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (buffer.size_padded() < this->size_padded()) { throw gpu_device_ptr_exception{ fmt::format("Buffer too small to perform copy (needed: {}, provided: {})!", this->size_padded(), buffer.size_padded()) }; @@ -533,14 +533,14 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(std::vector &buffer) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); this->copy_to_host(buffer, 0, this->size_padded()); } template void gpu_device_ptr::copy_to_host(std::vector &buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (buffer.size() < rcount) { @@ -551,7 +551,7 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_host(host_pointer_type buffer) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); this->copy_to_host(buffer, 0, this->size_padded()); @@ -559,8 +559,8 @@ void gpu_device_ptr::copy_ template void gpu_device_ptr::copy_to_other_device(derived_gpu_device_ptr &target) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); - PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?"); this->copy_to_other_device(target, 0, this->size_padded()); } diff --git a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp index 6e417524b..d25e879e4 100644 --- a/src/plssvm/backends/OpenCL/detail/device_ptr.cpp +++ b/src/plssvm/backends/OpenCL/detail/device_ptr.cpp @@ -84,7 +84,7 @@ device_ptr::~device_ptr() { template void device_ptr::memset(const int pattern, const size_type pos, const size_type num_bytes) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; @@ -104,7 +104,7 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty template void device_ptr::fill(const value_type value, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! 
Maybe *this has been default constructed?"); if (pos >= this->size_padded()) { throw backend_exception{ fmt::format("Illegal access in fill!: {} >= {}", pos, this->size_padded()) }; @@ -125,7 +125,7 @@ void device_ptr::fill(const value_type value, const size_type pos, const size template void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const size_type pos, const size_type count) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); @@ -142,7 +142,7 @@ void device_ptr::copy_to_device(const_host_pointer_type data_to_copy, const s template void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, const std::size_t spitch, const std::size_t width, const std::size_t height) { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(data_to_copy != nullptr, "Invalid host pointer for the data to copy!"); if (width > spitch) { @@ -180,7 +180,7 @@ void device_ptr::copy_to_device_strided(const_host_pointer_type data_to_copy, template void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); PLSSVM_ASSERT(buffer != nullptr, "Invalid host pointer for the data to copy!"); const size_type rcount = std::min(count, this->size_padded() - pos); @@ -197,8 +197,8 @@ void device_ptr::copy_to_host(host_pointer_type buffer, const size_type pos, template void device_ptr::copy_to_other_device(device_ptr &target, const size_type pos, const size_type count) const { - PLSSVM_ASSERT(data_ != device_ptr_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); - PLSSVM_ASSERT(target.get() != nullptr, "Invalid target pointer! Maybe target has been default constructed?"); + PLSSVM_ASSERT(data_ != device_pointer_type{}, "Invalid data pointer! Maybe *this has been default constructed?"); + PLSSVM_ASSERT(target.get() != device_pointer_type{}, "Invalid target pointer! Maybe target has been default constructed?"); const size_type rcount = std::min(count, this->size_padded() - pos); if (target.size_padded() < rcount) { From 38c27fea12edac6ceae5070117a9eebeb73643e9 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 9 Oct 2024 16:02:08 +0200 Subject: [PATCH 22/93] Update tests to support USM device_ptr and the cg_streaming solver. 
--- tests/backends/CUDA/detail/device_ptr.cpp | 21 +- tests/backends/HIP/detail/device_ptr.hip | 21 +- tests/backends/OpenCL/detail/device_ptr.cpp | 24 +- tests/backends/OpenCL/detail/utility.cpp | 4 +- .../SYCL/AdaptiveCpp/detail/device_ptr.cpp | 21 +- .../backends/SYCL/DPCPP/detail/device_ptr.cpp | 21 +- tests/backends/generic_csvm_tests.hpp | 5 +- tests/backends/generic_device_ptr_tests.hpp | 271 +++++++++++------- tests/backends/generic_gpu_csvm_tests.hpp | 78 ++++- tests/detail/cmd/parser_train.cpp | 2 +- tests/solver_types.cpp | 5 +- tests/types_to_test.hpp | 2 +- 12 files changed, 357 insertions(+), 118 deletions(-) diff --git a/tests/backends/CUDA/detail/device_ptr.cpp b/tests/backends/CUDA/detail/device_ptr.cpp index f97d0d8ab..52ba58ed5 100644 --- a/tests/backends/CUDA/detail/device_ptr.cpp +++ b/tests/backends/CUDA/detail/device_ptr.cpp @@ -18,10 +18,11 @@ #include // std::tuple -template +template struct cuda_device_ptr_test_type { using device_ptr_type = plssvm::cuda::detail::device_ptr; using queue_type = int; + static constexpr bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = 0; @@ -29,7 +30,7 @@ struct cuda_device_ptr_test_type { } }; -using cuda_device_ptr_tuple = std::tuple, cuda_device_ptr_test_type>; +using cuda_device_ptr_tuple = std::tuple, cuda_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using cuda_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -40,3 +41,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtr, DevicePtr, cuda_device_ptr_type_gt INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtr, DevicePtrLayout, cuda_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrDeathTest, DevicePtrDeathTest, cuda_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using cuda_device_ptr_usm_tuple = std::tuple, cuda_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using cuda_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using cuda_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrUSM, DevicePtr, cuda_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrUSM, DevicePtrLayout, cuda_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(CUDADevicePtrUSMDeathTest, DevicePtrDeathTest, cuda_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/HIP/detail/device_ptr.hip b/tests/backends/HIP/detail/device_ptr.hip index ecf8ce92a..09e9992de 100644 --- a/tests/backends/HIP/detail/device_ptr.hip +++ b/tests/backends/HIP/detail/device_ptr.hip @@ -18,10 +18,11 @@ #include // std::tuple -template +template struct hip_device_ptr_test_type { using device_ptr_type = plssvm::hip::detail::device_ptr; using queue_type = int; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = 0; @@ -29,7 +30,7 @@ struct hip_device_ptr_test_type { } }; -using hip_device_ptr_tuple = std::tuple, hip_device_ptr_test_type>; +using hip_device_ptr_tuple = std::tuple, hip_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using hip_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -40,3 +41,19 @@ 
INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtr, DevicePtr, hip_device_ptr_type_gtes INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtr, DevicePtrLayout, hip_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrDeathTest, DevicePtrDeathTest, hip_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using hip_device_ptr_usm_tuple = std::tuple, hip_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using hip_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using hip_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrUSM, DevicePtr, hip_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrUSM, DevicePtrLayout, hip_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(HIPDevicePtrUSMDeathTest, DevicePtrDeathTest, hip_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/OpenCL/detail/device_ptr.cpp b/tests/backends/OpenCL/detail/device_ptr.cpp index b9a638e04..0ea0f4e40 100644 --- a/tests/backends/OpenCL/detail/device_ptr.cpp +++ b/tests/backends/OpenCL/detail/device_ptr.cpp @@ -13,6 +13,7 @@ #include "plssvm/backends/OpenCL/detail/command_queue.hpp" // plssvm::opencl::detail::command_queue #include "plssvm/backends/OpenCL/detail/context.hpp" // plssvm::opencl::detail::context #include "plssvm/backends/OpenCL/detail/utility.hpp" // plssvm::opencl::detail::get_contexts +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "tests/backends/generic_device_ptr_tests.hpp" // generic device pointer tests to instantiate #include "tests/naming.hpp" // naming::test_parameter_to_name @@ -23,19 +24,20 @@ #include // std::tuple #include // std::vector -template +template struct opencl_device_ptr_test_type { using device_ptr_type = plssvm::opencl::detail::device_ptr; using queue_type = plssvm::opencl::detail::command_queue; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const std::vector contexts{ plssvm::opencl::detail::get_contexts(plssvm::target_platform::automatic).first }; - static const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].devices[0] }; + static const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].device }; return queue; } }; -using opencl_device_ptr_tuple = std::tuple, opencl_device_ptr_test_type>; +using opencl_device_ptr_tuple = std::tuple, opencl_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using opencl_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -46,3 +48,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtr, DevicePtr, opencl_device_ptr_typ INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtr, DevicePtrLayout, opencl_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrDeathTest, DevicePtrDeathTest, opencl_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using opencl_device_ptr_usm_tuple = std::tuple, opencl_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using opencl_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using opencl_device_ptr_usm_layout_type_gtest = 
util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrUSM, DevicePtr, opencl_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrUSM, DevicePtrLayout, opencl_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(OpenCLDevicePtrUSMDeathTest, DevicePtrDeathTest, opencl_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/OpenCL/detail/utility.cpp b/tests/backends/OpenCL/detail/utility.cpp index 3204d15a5..fe24e594a 100644 --- a/tests/backends/OpenCL/detail/utility.cpp +++ b/tests/backends/OpenCL/detail/utility.cpp @@ -91,7 +91,7 @@ TEST(OpenCLUtility, get_opencl_target_version) { TEST(OpenCLUtility, get_driver_version) { // create a valid command queue const std::vector contexts{ plssvm::opencl::detail::get_contexts(plssvm::target_platform::automatic).first }; - const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].devices[0] }; + const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].device }; // the device name should not be empty const std::string driver_version = plssvm::opencl::detail::get_driver_version(queue); EXPECT_FALSE(driver_version.empty()); @@ -100,7 +100,7 @@ TEST(OpenCLUtility, get_driver_version) { TEST(OpenCLUtility, get_device_name) { // create a valid command queue const std::vector contexts{ plssvm::opencl::detail::get_contexts(plssvm::target_platform::automatic).first }; - const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].devices[0] }; + const plssvm::opencl::detail::command_queue queue{ contexts[0], contexts[0].device }; // the device name should not be empty const std::string name = plssvm::opencl::detail::get_device_name(queue); EXPECT_FALSE(name.empty()); diff --git a/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 2b1f5f558..7bfb8cd43 100644 --- a/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/tests/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -20,10 +20,11 @@ #include // std::tuple -template +template struct adaptivecpp_device_ptr_test_type { using device_ptr_type = plssvm::adaptivecpp::detail::device_ptr; using queue_type = typename device_ptr_type::queue_type; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = plssvm::adaptivecpp::detail::get_default_queue(); @@ -31,7 +32,7 @@ struct adaptivecpp_device_ptr_test_type { } }; -using adaptivecpp_device_ptr_tuple = std::tuple, adaptivecpp_device_ptr_test_type>; +using adaptivecpp_device_ptr_tuple = std::tuple, adaptivecpp_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using adaptivecpp_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -42,3 +43,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtr, DevicePtr, adaptivecpp_devi INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtr, DevicePtrLayout, adaptivecpp_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrDeathTest, DevicePtrDeathTest, adaptivecpp_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using adaptivecpp_device_ptr_usm_tuple = std::tuple, adaptivecpp_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using 
adaptivecpp_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using adaptivecpp_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrUSM, DevicePtr, adaptivecpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrUSM, DevicePtrLayout, adaptivecpp_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(AdaptiveCppDevicePtrUSMDeathTest, DevicePtrDeathTest, adaptivecpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp b/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp index 06722fc3f..afbc9cd1b 100644 --- a/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp +++ b/tests/backends/SYCL/DPCPP/detail/device_ptr.cpp @@ -20,10 +20,11 @@ #include // std::tuple -template +template struct dpcpp_device_ptr_test_type { using device_ptr_type = plssvm::dpcpp::detail::device_ptr; using queue_type = typename device_ptr_type::queue_type; + constexpr static bool use_usm_allocations = UUA; static const queue_type &default_queue() { static const queue_type queue = plssvm::dpcpp::detail::get_default_queue(); @@ -31,7 +32,7 @@ struct dpcpp_device_ptr_test_type { } }; -using dpcpp_device_ptr_tuple = std::tuple, dpcpp_device_ptr_test_type>; +using dpcpp_device_ptr_tuple = std::tuple, dpcpp_device_ptr_test_type>; // the tests used in the instantiated GTest test suites using dpcpp_device_ptr_type_gtest = util::combine_test_parameters_gtest_t>; @@ -42,3 +43,19 @@ INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtr, DevicePtr, dpcpp_device_ptr_type_ INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtr, DevicePtrLayout, dpcpp_device_ptr_layout_type_gtest, naming::test_parameter_to_name); INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrDeathTest, DevicePtrDeathTest, dpcpp_device_ptr_type_gtest, naming::test_parameter_to_name); + +// +// test USM pointer +// + +using dpcpp_device_ptr_usm_tuple = std::tuple, dpcpp_device_ptr_test_type>; + +// the tests used in the instantiated GTest test suites +using dpcpp_device_ptr_usm_type_gtest = util::combine_test_parameters_gtest_t>; +using dpcpp_device_ptr_usm_layout_type_gtest = util::combine_test_parameters_gtest_t, util::layout_type_list>; + +// instantiate type-parameterized tests +INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrUSM, DevicePtr, dpcpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); +INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrUSM, DevicePtrLayout, dpcpp_device_ptr_usm_layout_type_gtest, naming::test_parameter_to_name); + +INSTANTIATE_TYPED_TEST_SUITE_P(DPCPPDevicePtrUSMDeathTest, DevicePtrDeathTest, dpcpp_device_ptr_usm_type_gtest, naming::test_parameter_to_name); diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp index 562785728..4cfac24bb 100644 --- a/tests/backends/generic_csvm_tests.hpp +++ b/tests/backends/generic_csvm_tests.hpp @@ -194,6 +194,7 @@ template VALUE NOT USED! 
} case plssvm::solver_type::cg_explicit: + case plssvm::solver_type::cg_streaming: // no additional arguments are used return init_explicit_matrices(std::move(matr), csvm); case plssvm::solver_type::cg_implicit: @@ -900,7 +901,7 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { #else SUCCEED() << "Solver type is automatic, but assertions are disabled!"; #endif - } else if constexpr (solver == plssvm::solver_type::cg_explicit) { + } else if constexpr (solver == plssvm::solver_type::cg_explicit || solver == plssvm::solver_type::cg_streaming) { // run the assemble the kernel matrix kernels const std::vector kernel_matrix_d = svm.assemble_kernel_matrix(solver, params, data, q_red, QA_cost); ASSERT_EQ(kernel_matrix_d.size(), num_devices); @@ -1010,7 +1011,7 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { #else SUCCEED() << "Solver type is automatic, but assertions are disabled!"; #endif - } else if constexpr (solver == plssvm::solver_type::cg_explicit) { + } else if constexpr (solver == plssvm::solver_type::cg_explicit || solver == plssvm::solver_type::cg_streaming) { // run the assemble the kernel matrix kernels const std::vector kernel_matrix_d = svm.assemble_kernel_matrix(solver, params, data, q_red, QA_cost); ASSERT_EQ(kernel_matrix_d.size(), num_devices); diff --git a/tests/backends/generic_device_ptr_tests.hpp b/tests/backends/generic_device_ptr_tests.hpp index 6a8713dc7..2f142d49d 100644 --- a/tests/backends/generic_device_ptr_tests.hpp +++ b/tests/backends/generic_device_ptr_tests.hpp @@ -40,13 +40,14 @@ TYPED_TEST_SUITE_P(DevicePtr); TYPED_TEST_P(DevicePtr, default_construct) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; // default construct device_ptr const device_ptr_type ptr{}; // empty data EXPECT_FALSE(static_cast(ptr)); - EXPECT_EQ(ptr.get(), nullptr); + EXPECT_EQ(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.size(), 0); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 0, 0 })); EXPECT_TRUE(ptr.empty()); @@ -55,15 +56,17 @@ TYPED_TEST_P(DevicePtr, default_construct) { TYPED_TEST_P(DevicePtr, construct_size) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ 42, queue }; + const device_ptr_type ptr{ 42, queue, use_usm_allocations }; // check data EXPECT_TRUE(static_cast(ptr)); - EXPECT_NE(ptr.get(), nullptr); + EXPECT_NE(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(ptr.padding(), (plssvm::shape{ 0, 0 })); @@ -73,15 +76,17 @@ TYPED_TEST_P(DevicePtr, construct_size) { TYPED_TEST_P(DevicePtr, construct_shape) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, queue }; + const 
device_ptr_type ptr{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; // check data EXPECT_TRUE(static_cast(ptr)); - EXPECT_NE(ptr.get(), nullptr); + EXPECT_NE(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 16 })); // check padding EXPECT_EQ(ptr.padding(), (plssvm::shape{ 0, 0 })); @@ -91,15 +96,17 @@ TYPED_TEST_P(DevicePtr, construct_shape) { TYPED_TEST_P(DevicePtr, construct_shape_and_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 4 }, queue }; + const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 4 }, queue, use_usm_allocations }; // check data EXPECT_TRUE(static_cast(ptr)); - EXPECT_NE(ptr.get(), nullptr); + EXPECT_NE(ptr.get(), data_ptr_type{}); EXPECT_EQ(ptr.shape(), (plssvm::shape{ 42, 16 })); // check padding EXPECT_EQ(ptr.padding(), (plssvm::shape{ 4, 4 })); @@ -109,17 +116,19 @@ TYPED_TEST_P(DevicePtr, construct_shape_and_padding) { TYPED_TEST_P(DevicePtr, move_construct) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ 42, queue }; + device_ptr_type first{ 42, queue, use_usm_allocations }; const device_ptr_type second{ std::move(first) }; // check data EXPECT_TRUE(static_cast(second)); // EXPECT_EQ(second.queue(), queue); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); @@ -127,7 +136,7 @@ TYPED_TEST_P(DevicePtr, move_construct) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -137,17 +146,19 @@ TYPED_TEST_P(DevicePtr, move_construct) { TYPED_TEST_P(DevicePtr, move_construct_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; const device_ptr_type second{ std::move(first) }; // check data EXPECT_TRUE(static_cast(second)); // EXPECT_EQ(second.queue(), queue); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); 
@@ -155,7 +166,7 @@ TYPED_TEST_P(DevicePtr, move_construct_with_padding) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -165,11 +176,13 @@ TYPED_TEST_P(DevicePtr, move_construct_with_padding) { TYPED_TEST_P(DevicePtr, move_assign) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ 42, queue }; + device_ptr_type first{ 42, queue, use_usm_allocations }; device_ptr_type second; // move assign @@ -177,7 +190,7 @@ TYPED_TEST_P(DevicePtr, move_assign) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); @@ -185,7 +198,7 @@ TYPED_TEST_P(DevicePtr, move_assign) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -195,11 +208,13 @@ TYPED_TEST_P(DevicePtr, move_assign) { TYPED_TEST_P(DevicePtr, move_assign_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; device_ptr_type second; // move assign @@ -207,7 +222,7 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); @@ -215,7 +230,7 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) { // check moved-from data EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -225,11 +240,13 @@ TYPED_TEST_P(DevicePtr, move_assign_with_padding) { TYPED_TEST_P(DevicePtr, swap_member_function) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ 42, queue }; + 
device_ptr_type first{ 42, queue, use_usm_allocations }; device_ptr_type second{}; // swap both device_ptr using the member function @@ -237,14 +254,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 42, 1 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -254,11 +271,13 @@ TYPED_TEST_P(DevicePtr, swap_member_function) { TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; device_ptr_type second{}; // swap both device_ptr using the member function @@ -266,14 +285,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 46, 15 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -283,12 +302,14 @@ TYPED_TEST_P(DevicePtr, swap_member_function_with_padding) { TYPED_TEST_P(DevicePtr, swap_free_function) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ 42, queue }; - device_ptr_type second; + device_ptr_type first{ 42, queue, use_usm_allocations }; + device_ptr_type second{}; // swap both device_ptr using the free function using std::swap; @@ -296,14 +317,14 @@ TYPED_TEST_P(DevicePtr, swap_free_function) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 1 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 0, 0 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 42, 1 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -313,12 +334,14 @@ 
TYPED_TEST_P(DevicePtr, swap_free_function) { TYPED_TEST_P(DevicePtr, swap_free_function_with_padding) { using test_type = typename TestFixture::fixture_test_type; using device_ptr_type = typename test_type::device_ptr_type; + using data_ptr_type = typename device_ptr_type::device_pointer_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct two device_ptr - device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue }; - device_ptr_type second; + device_ptr_type first{ plssvm::shape{ 42, 10 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; + device_ptr_type second{}; // swap both device_ptr using the free function using std::swap; @@ -326,14 +349,14 @@ TYPED_TEST_P(DevicePtr, swap_free_function_with_padding) { // check data EXPECT_TRUE(static_cast(second)); - EXPECT_NE(second.get(), nullptr); + EXPECT_NE(second.get(), data_ptr_type{}); EXPECT_EQ(second.shape(), (plssvm::shape{ 42, 10 })); // check padding EXPECT_EQ(second.padding(), (plssvm::shape{ 4, 5 })); EXPECT_EQ(second.shape_padded(), (plssvm::shape{ 46, 15 })); EXPECT_FALSE(static_cast(first)); - EXPECT_EQ(first.get(), nullptr); + EXPECT_EQ(first.get(), data_ptr_type{}); EXPECT_EQ(first.shape(), (plssvm::shape{ 0, 0 })); // check padding EXPECT_EQ(first.padding(), (plssvm::shape{ 0, 0 })); @@ -345,9 +368,10 @@ TYPED_TEST_P(DevicePtr, operator_bool) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_TRUE(static_cast(ptr1)); // construct empty device_ptr @@ -360,17 +384,18 @@ TYPED_TEST_P(DevicePtr, size) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_EQ(ptr1.size(), 42); // construct device_ptr with shape - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_EQ(ptr2.size(), 42 * 16); // construct device_ptr with shape and padding - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_EQ(ptr3.size(), 42 * 16); // construct empty device_ptr @@ -383,17 +408,18 @@ TYPED_TEST_P(DevicePtr, shape) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_EQ(ptr1.shape(), (plssvm::shape{ 42, 1 })); // construct device_ptr with shape - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; 
EXPECT_EQ(ptr2.shape(), (plssvm::shape{ 42, 16 })); // construct device_ptr with shape and padding - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_EQ(ptr3.shape(), (plssvm::shape{ 42, 16 })); // construct empty device_ptr @@ -406,21 +432,22 @@ TYPED_TEST_P(DevicePtr, empty) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_FALSE(ptr1.empty()); // construct device_ptr - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr2.empty()); // construct device_ptr - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr3.empty()); // construct device_ptr - const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_TRUE(ptr4.empty()); // construct empty device_ptr @@ -433,9 +460,10 @@ TYPED_TEST_P(DevicePtr, padding) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue }; + const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; EXPECT_EQ(ptr.padding(), (plssvm::shape{ 4, 5 })); ; } @@ -445,9 +473,10 @@ TYPED_TEST_P(DevicePtr, size_padded) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue }; + const device_ptr_type ptr{ plssvm::shape{ 42, 16 }, plssvm::shape{ 4, 5 }, queue, use_usm_allocations }; EXPECT_EQ(ptr.size_padded(), (42 + 4) * (16 + 5)); } @@ -456,17 +485,18 @@ TYPED_TEST_P(DevicePtr, shape_padded) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_EQ(ptr1.shape_padded(), (plssvm::shape{ 42, 1 })); // construct device_ptr with shape - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_EQ(ptr2.shape_padded(), (plssvm::shape{ 42, 16 })); // construct device_ptr with shape and padding - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const 
device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_EQ(ptr3.shape_padded(), (plssvm::shape{ 45, 19 })); // construct empty device_ptr @@ -479,25 +509,26 @@ TYPED_TEST_P(DevicePtr, is_padded) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - const device_ptr_type ptr1{ 42, queue }; + const device_ptr_type ptr1{ 42, queue, use_usm_allocations }; EXPECT_FALSE(ptr1.is_padded()); // construct device_ptr - const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue }; + const device_ptr_type ptr2{ plssvm::shape{ 42, 16 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr2.is_padded()); // construct device_ptr - const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr3{ plssvm::shape{ 42, 16 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_TRUE(ptr3.is_padded()); // construct device_ptr - const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue }; + const device_ptr_type ptr4{ plssvm::shape{ 0, 0 }, plssvm::shape{ 3, 3 }, queue, use_usm_allocations }; EXPECT_TRUE(ptr4.is_padded()); // construct device_ptr - const device_ptr_type ptr5{ plssvm::shape{ 42, 16 }, plssvm::shape{ 0, 0 }, queue }; + const device_ptr_type ptr5{ plssvm::shape{ 42, 16 }, plssvm::shape{ 0, 0 }, queue, use_usm_allocations }; EXPECT_FALSE(ptr5.is_padded()); // construct empty device_ptr @@ -511,9 +542,10 @@ TYPED_TEST_P(DevicePtr, memset) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // memset values to all ones ptr.memset(1, 2); @@ -534,9 +566,10 @@ TYPED_TEST_P(DevicePtr, memset_with_numbytes) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // memset values to all ones ptr.memset(1, 2, 4 * sizeof(value_type)); @@ -556,9 +589,10 @@ TYPED_TEST_P(DevicePtr, memset_invalid_pos) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // perform invalid memset EXPECT_THROW_WHAT(ptr.memset(0, 10, 1), @@ -572,9 +606,10 @@ TYPED_TEST_P(DevicePtr, fill) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // fill values with a specific value ptr.fill(value_type{ 42.0 }, 2); @@ -595,9 +630,10 @@ 
TYPED_TEST_P(DevicePtr, fill_with_count) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // fill values with a specific value ptr.fill(value_type{ 42.0 }, 2, 4); @@ -618,9 +654,10 @@ TYPED_TEST_P(DevicePtr, fill_invalid_pos) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // perform invalid fill EXPECT_THROW_WHAT(ptr.fill(value_type{ 42.0 }, 10, 1), @@ -634,9 +671,10 @@ TYPED_TEST_P(DevicePtr, copy_vector) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -657,9 +695,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_copy_back_all) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -680,9 +719,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_copy_back_some) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -703,9 +743,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_copy_to_too_many) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -726,9 +767,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_too_few_host_elements) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data to the device with too few elements std::vector data(8, 42); @@ -741,9 +783,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_too_few_buffer_elements) { using value_type = typename 
device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data back to the host with a buffer with too few elements std::vector buffer(8); @@ -756,9 +799,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_too_few_host_elements) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data to the device with too few elements std::vector data(4, 42); @@ -771,9 +815,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_with_count_too_few_buffer_elements) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // try copying data back to the host with a buffer with too few elements std::vector buffer(4); @@ -786,9 +831,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_strided) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -811,9 +857,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_strided_invalid_spitch_width_combination) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -828,9 +875,10 @@ TYPED_TEST_P(DevicePtr, copy_vector_strided_submatrix_too_big) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -845,9 +893,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device 
std::vector data(14, 42); @@ -868,9 +917,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_with_count_copy_back_all) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -891,9 +941,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_with_count_copy_back_some) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -914,9 +965,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_with_count_copy_to_too_many) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 6, queue }; + device_ptr_type ptr{ 6, queue, use_usm_allocations }; // create data to copy to the device std::vector data(6, 42); @@ -937,9 +989,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_strided) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -962,9 +1015,10 @@ TYPED_TEST_P(DevicePtr, copy_ptr_strided_invalid_spitch_width_combination) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 2 }, queue, use_usm_allocations }; // create data to copy to the device std::vector data(20); // 5 x 4 @@ -979,9 +1033,10 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -990,7 +1045,7 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device) { ptr.copy_to_device(data); // other device_ptr - device_ptr_type other_ptr{ 10, queue }; + device_ptr_type other_ptr{ 10, queue, !use_usm_allocations }; ptr.copy_to_other_device(other_ptr); // copy data back to the host @@ -1007,9 +1062,10 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_too_few_device_elements) using value_type = typename device_ptr_type::value_type; using queue_type = 
typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -1018,7 +1074,7 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_too_few_device_elements) ptr.copy_to_device(data); // other device_ptr - device_ptr_type other_ptr{ 5, queue }; + device_ptr_type other_ptr{ 5, queue, !use_usm_allocations }; EXPECT_THROW_WHAT(ptr.copy_to_other_device(other_ptr), plssvm::exception, "Buffer too small to perform copy (needed: 10, provided: 5)!"); } @@ -1028,9 +1084,10 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_with_count) { using value_type = typename device_ptr_type::value_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device std::vector data(14, 42); @@ -1039,7 +1096,7 @@ TYPED_TEST_P(DevicePtr, copy_device_ptr_to_other_device_with_count) { ptr.copy_to_device(data); // other device_ptr - device_ptr_type other_ptr{ 5, queue }; + device_ptr_type other_ptr{ 5, queue, !use_usm_allocations }; ptr.copy_to_other_device(other_ptr, 1, 5); // copy data back to the host @@ -1113,9 +1170,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device const plssvm::matrix data{ plssvm::shape{ 5, 3 }, value_type{ 42 } }; @@ -1146,9 +1204,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_with_padding) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 5, 3 }, plssvm::shape{ 4, 4 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 5, 3 }, plssvm::shape{ 4, 4 }, queue, use_usm_allocations }; // create data to copy to the device const plssvm::matrix data{ plssvm::shape{ 5, 3 }, value_type{ 42 }, plssvm::shape{ 4, 4 } }; @@ -1171,9 +1230,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_different_layouts) { const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; constexpr plssvm::layout_type other_layout = layout == plssvm::layout_type::aos ? 
plssvm::layout_type::soa : plssvm::layout_type::aos; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // create data to copy to the device const plssvm::matrix data{ plssvm::shape{ 5, 3 }, value_type{ 42 } }; @@ -1204,9 +1264,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_too_few_host_elements) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data to the device with too few elements plssvm::matrix data{ plssvm::shape{ 2, 4 }, value_type{ 42 } }; @@ -1220,9 +1281,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_too_few_buffer_elements) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // try copying data back to the host with a buffer with too few elements plssvm::matrix buffer{ plssvm::shape{ 2, 4 } }; @@ -1236,9 +1298,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape(2, 3), queue }; + device_ptr_type ptr{ plssvm::shape(2, 3), queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }); @@ -1261,9 +1324,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_with_padding) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 3, 3 }, plssvm::shape{ 4, 4 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 3, 3 }, plssvm::shape{ 4, 4 }, queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }, plssvm::shape{ 4, 4 }); @@ -1291,9 +1355,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_different_layouts) { const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; constexpr plssvm::layout_type other_layout = layout == plssvm::layout_type::aos ? 
plssvm::layout_type::soa : plssvm::layout_type::aos; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 3 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 3 }, queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }); @@ -1325,9 +1390,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_full_matrix_strided) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape(5, 3), queue }; + device_ptr_type ptr{ plssvm::shape(5, 3), queue, use_usm_allocations }; // create data to copy to the device const auto data = util::generate_specific_matrix>(plssvm::shape{ 5, 3 }); @@ -1349,9 +1415,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_too_few_host_elements) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue, use_usm_allocations }; // try copying data to the device with too few elements plssvm::matrix data{ plssvm::shape{ 2, 4 }, value_type{ 42 } }; @@ -1365,9 +1432,10 @@ TYPED_TEST_P(DevicePtrLayout, copy_matrix_strided_invalid_submatrix) { using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); constexpr plssvm::layout_type layout = util::test_parameter_value_at_v<0, TypeParam>; + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue }; + device_ptr_type ptr{ plssvm::shape{ 2, 5 }, queue, use_usm_allocations }; // try copying data to the device with too few elements plssvm::matrix data{ plssvm::shape{ 4, 5 }, value_type{ 42 } }; @@ -1423,9 +1491,10 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_invalid_host_ptr) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // copy with invalid data pointer EXPECT_DEATH(ptr.copy_to_device(nullptr), ::testing::HasSubstr("Invalid host pointer for the data to copy!")); @@ -1456,9 +1525,10 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_with_count_invalid_host_ptr) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // copy with invalid data pointer EXPECT_DEATH(ptr.copy_to_device(nullptr, 0, 10), ::testing::HasSubstr("Invalid host pointer for the data to copy!")); @@ -1486,9 +1556,10 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_strided_invalid_host_ptr) { using device_ptr_type = 
typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct device_ptr - device_ptr_type ptr{ 1, queue }; + device_ptr_type ptr{ 1, queue, use_usm_allocations }; // copy with invalid data pointer EXPECT_DEATH(ptr.copy_to_device_strided(nullptr, 0, 0, 0), ::testing::HasSubstr("Invalid host pointer for the data to copy!")); @@ -1513,10 +1584,11 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_to_other_device_invalid_device_ptr) { using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct default device_ptr device_ptr_type def{}; - device_ptr_type ptr{ 2, queue }; + device_ptr_type ptr{ 2, queue, use_usm_allocations }; // copy with invalid device pointer EXPECT_DEATH(def.copy_to_other_device(ptr), ::testing::HasSubstr("Invalid data pointer! Maybe *this has been default constructed?")); @@ -1528,10 +1600,11 @@ TYPED_TEST_P(DevicePtrDeathTest, copy_to_other_device_with_count_invalid_device_ using device_ptr_type = typename test_type::device_ptr_type; using queue_type = typename test_type::queue_type; const queue_type &queue = test_type::default_queue(); + constexpr bool use_usm_allocations = test_type::use_usm_allocations; // construct default device_ptr device_ptr_type def{}; - device_ptr_type ptr{ 10, queue }; + device_ptr_type ptr{ 10, queue, use_usm_allocations }; // copy with invalid device pointer EXPECT_DEATH(def.copy_to_other_device(ptr, 0, 10), ::testing::HasSubstr("Invalid data pointer! Maybe *this has been default constructed?")); diff --git a/tests/backends/generic_gpu_csvm_tests.hpp b/tests/backends/generic_gpu_csvm_tests.hpp index dea31b85c..dfd0b2bb4 100644 --- a/tests/backends/generic_gpu_csvm_tests.hpp +++ b/tests/backends/generic_gpu_csvm_tests.hpp @@ -409,7 +409,82 @@ TYPED_TEST_P(GenericGPUCSVMKernelFunction, run_assemble_kernel_matrix_explicit) const plssvm::detail::execution_range exec{ block, svm.get_max_work_group_size(device_id), grid, svm.get_max_grid_size(device_id) }; // calculate the current part of the kernel matrix - const device_ptr_type kernel_matrix_d = svm.run_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost); + const device_ptr_type kernel_matrix_d = svm.run_assemble_kernel_matrix_explicit(device_id, exec, params, false, data_d, q_red_d, QA_cost); + + // copy the kernel matrix back to the host + std::vector kernel_matrix(kernel_matrix_d.size()); + kernel_matrix_d.copy_to_host(kernel_matrix); + + // calculate ground truth + const std::vector correct_kernel_matrix = ground_truth::assemble_device_specific_kernel_matrix(params, data_matr, q_red, QA_cost, *svm.data_distribution_, device_id); + + // check for correctness + ASSERT_EQ(kernel_matrix.size(), correct_kernel_matrix.size()); + EXPECT_FLOATING_POINT_VECTOR_NEAR_EPS(kernel_matrix, correct_kernel_matrix, 1e6); + } +} + +TYPED_TEST_P(GenericGPUCSVMKernelFunction, run_assemble_kernel_matrix_explicit_USM) { + using csvm_test_type = util::test_parameter_type_at_t<0, TypeParam>; + using mock_csvm_type = typename csvm_test_type::mock_csvm_type; + using device_ptr_type = typename csvm_test_type::device_ptr_type; + constexpr plssvm::kernel_function_type kernel = util::test_parameter_value_at_v<0, TypeParam>; + + plssvm::parameter params{ 
plssvm::kernel_type = kernel }; + if constexpr (kernel != plssvm::kernel_function_type::linear) { + params.gamma = plssvm::real_type{ 0.001 }; + } + const plssvm::data_set data{ PLSSVM_TEST_FILE }; + auto data_matr{ data.data() }; + if constexpr (kernel == plssvm::kernel_function_type::chi_squared) { + // chi-squared is well-defined for non-negative values only + data_matr = util::matrix_abs(data_matr); + } + + // create C-SVM: must be done using the mock class since the member function to test is private or protected + const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); + const std::size_t num_devices = svm.num_available_devices(); + // be sure to use the correct data distribution + svm.data_distribution_ = std::make_unique(data.num_data_points() - 1, num_devices); + + // perform dimensional reduction + const auto [q_red, QA_cost] = ground_truth::perform_dimensional_reduction(params, data_matr); + + for (std::size_t device_id = 0; device_id < num_devices; ++device_id) { + SCOPED_TRACE(fmt::format("device_id {} ({}/{})", device_id, device_id + 1, num_devices)); + + // check whether the current device is responsible for at least one data point! + if (svm.data_distribution_->place_specific_num_rows(device_id) == 0) { + continue; + } + auto &device = svm.devices_[device_id]; + + // upload complete A and q_red to each device + device_ptr_type data_d{ data_matr.shape(), data_matr.padding(), device }; + data_d.copy_to_device(data_matr); + + device_ptr_type q_red_d{ q_red.size() + plssvm::PADDING_SIZE, device }; + q_red_d.copy_to_device(q_red, 0, q_red.size()); + + // kernel launch specific sizes + const unsigned long long num_rows_reduced = data_matr.shape().x; + const unsigned long long device_specific_num_rows = svm.data_distribution_->place_specific_num_rows(device_id); + const unsigned long long device_row_offset = svm.data_distribution_->place_row_offset(device_id); + + // the block dimension is THREAD_BLOCK_SIZE x THREAD_BLOCK_SIZE + const plssvm::detail::dim_type block{ std::size_t{ plssvm::THREAD_BLOCK_SIZE }, std::size_t{ plssvm::THREAD_BLOCK_SIZE } }; + + // define the full execution grid + const plssvm::detail::dim_type grid{ + static_cast(std::ceil(static_cast(num_rows_reduced - device_row_offset) / static_cast(block.x * plssvm::INTERNAL_BLOCK_SIZE))), + static_cast(std::ceil(static_cast(device_specific_num_rows) / static_cast(block.y * plssvm::INTERNAL_BLOCK_SIZE))) + }; + + // create the final execution range + const plssvm::detail::execution_range exec{ block, svm.get_max_work_group_size(device_id), grid, svm.get_max_grid_size(device_id) }; + + // calculate the current part of the kernel matrix + const device_ptr_type kernel_matrix_d = svm.run_assemble_kernel_matrix_explicit(device_id, exec, params, true, data_d, q_red_d, QA_cost); // copy the kernel matrix back to the host std::vector kernel_matrix(kernel_matrix_d.size()); @@ -606,6 +681,7 @@ TYPED_TEST_P(GenericGPUCSVMKernelFunction, run_predict_kernel) { REGISTER_TYPED_TEST_SUITE_P(GenericGPUCSVMKernelFunction, run_assemble_kernel_matrix_explicit, + run_assemble_kernel_matrix_explicit_USM, run_assemble_kernel_matrix_implicit_blas_level_3, run_predict_kernel); diff --git a/tests/detail/cmd/parser_train.cpp b/tests/detail/cmd/parser_train.cpp index ba1392d75..54ea0de43 100644 --- a/tests/detail/cmd/parser_train.cpp +++ b/tests/detail/cmd/parser_train.cpp @@ -383,7 +383,7 @@ TEST_P(ParserTrainSolver, parsing) { // clang-format off INSTANTIATE_TEST_SUITE_P(ParserTrain, 
ParserTrainSolver, ::testing::Combine( ::testing::Values("-l", "--solver"), - ::testing::Values(plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_implicit)), + ::testing::Values(plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_streaming, plssvm::solver_type::cg_implicit)), naming::pretty_print_parameter_flag_and_value); // clang-format on diff --git a/tests/solver_types.cpp b/tests/solver_types.cpp index acf5a5464..f0fc68e03 100644 --- a/tests/solver_types.cpp +++ b/tests/solver_types.cpp @@ -22,12 +22,13 @@ TEST(SolverType, to_string) { // check conversions to std::string EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::automatic, "automatic"); EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::cg_explicit, "cg_explicit"); + EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::cg_streaming, "cg_streaming"); EXPECT_CONVERSION_TO_STRING(plssvm::solver_type::cg_implicit, "cg_implicit"); } TEST(SolverType, to_string_unknown) { // check conversions to std::string from unknown solver_type - EXPECT_CONVERSION_TO_STRING(static_cast(3), "unknown"); + EXPECT_CONVERSION_TO_STRING(static_cast(4), "unknown"); } // check whether the std::string -> plssvm::solver_type conversions are correct @@ -39,6 +40,8 @@ TEST(SolverType, from_string) { EXPECT_CONVERSION_FROM_STRING("AUTO", plssvm::solver_type::automatic); EXPECT_CONVERSION_FROM_STRING("cg_explicit", plssvm::solver_type::cg_explicit); EXPECT_CONVERSION_FROM_STRING("CG_Explicit", plssvm::solver_type::cg_explicit); + EXPECT_CONVERSION_FROM_STRING("cg_streaming", plssvm::solver_type::cg_streaming); + EXPECT_CONVERSION_FROM_STRING("CG_Streaming", plssvm::solver_type::cg_streaming); EXPECT_CONVERSION_FROM_STRING("cg_implicit", plssvm::solver_type::cg_implicit); EXPECT_CONVERSION_FROM_STRING("CG_Implicit", plssvm::solver_type::cg_implicit); } diff --git a/tests/types_to_test.hpp b/tests/types_to_test.hpp index 44db342b3..f8f5fc4de 100644 --- a/tests/types_to_test.hpp +++ b/tests/types_to_test.hpp @@ -475,7 +475,7 @@ constexpr std::array classification_types_to_tes }; /// A list of all available solver types. constexpr std::array solver_types_to_test = { - plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_implicit + plssvm::solver_type::automatic, plssvm::solver_type::cg_explicit, plssvm::solver_type::cg_streaming, plssvm::solver_type::cg_implicit }; /// A list of all solver types. From 91b75b36568f6dbf1720fbc1e48d82da9c679010 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 14 Oct 2024 16:13:05 +0200 Subject: [PATCH 23/93] Add missing data set size contribution. 
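For the cg_streaming solver, the kernel matrix is kept in host memory while the remaining buffers stay on the devices. The host-side memory estimate so far only accounted for the kernel matrix itself; the data set, the reduced q vector, and the BLAS matrices also reside in system memory, so their sizes are now charged to the host as well, attributed once to device 0 so they are not counted per device.

Roughly, the added accounting looks like the following sketch (illustrative only; the names data_set_size, q_red_size, and blas_matrices_size follow data_distribution.cpp, and memory_size is assumed to wrap a byte count):

    // charge the shared host-resident buffers exactly once (attributed to device 0)
    if (device_id == 0) {
        res.first += memory_size{ sizeof(real_type) * (data_set_size + q_red_size + blas_matrices_size) };
    }
    // per-device memory: q vector plus the larger of the data set and the BLAS matrices
    res.second[device_id] = memory_size{ sizeof(real_type) * (q_red_size + std::max(data_set_size, blas_matrices_size)) };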
--- src/plssvm/detail/data_distribution.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/plssvm/detail/data_distribution.cpp b/src/plssvm/detail/data_distribution.cpp index 016260389..9992d113d 100644 --- a/src/plssvm/detail/data_distribution.cpp +++ b/src/plssvm/detail/data_distribution.cpp @@ -205,7 +205,11 @@ std::pair> triangular_data_distribution::c // add up the individual sizes and report the memory size in BYTES // for streaming, the kernel matrix is on the host, while everything else is on the device - res.first += memory_size{ sizeof(real_type) * kernel_matrix_size }; + res.first += memory_size{ sizeof(real_type) }; + if (device_id == 0) { + // we also store the data set, q vector and BLAS matrices on the system + res.first += memory_size{ sizeof(real_type) * (data_set_size + q_red_size + blas_matrices_size) }; + } res.second[device_id] = memory_size{ sizeof(real_type) * (q_red_size + std::max(data_set_size, blas_matrices_size)) }; } From 18691a50edf49837d76abe8b4197eb0f3d4363e5 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 24 May 2025 22:07:03 +0200 Subject: [PATCH 24/93] Improve performance (mainly on AMD GPUs) and change implementations slightly such that the backends are more similar. --- .../cg_explicit/kernel_matrix_assembly.cuh | 118 +++++++------- .../kernel_matrix_assembly.hip.hpp | 118 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 2 +- .../cg_explicit/kernel_matrix_assembly.hpp | 126 +++++++-------- .../cg_explicit/kernel_matrix_assembly.cl | 91 +++++------ .../basic/kernel_matrix_assembly.hpp | 55 +++---- .../hierarchical/kernel_matrix_assembly.hpp | 144 +++++++++--------- .../scoped/kernel_matrix_assembly.hpp | 135 ++++++++-------- .../work_group/kernel_matrix_assembly.hpp | 124 +++++++-------- .../cg_explicit/kernel_matrix_assembly.hpp | 2 +- include/plssvm/constants.hpp | 7 +- src/plssvm/backends/Kokkos/csvm.cpp | 10 +- src/plssvm/backends/OpenCL/detail/utility.cpp | 4 +- .../detail/tracking/performance_tracker.cpp | 4 +- 14 files changed, 482 insertions(+), 458 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 8a766b7db..2a3eef5c4 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -14,20 +14,22 @@ #pragma once #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. 
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -37,80 +39,84 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const 
std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses 
+ const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = 
detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index 75a3cd9a5..f0e01f813 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -14,23 +14,25 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -40,80 +42,84 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type *q, const real_type QA_cost, const real_type cost, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... 
kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses + const auto i_linear = blockIdx_x * blockDim_x * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ull) - device_global_j * (device_global_j + 1ull) / 2ull + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index e575c6af2..af1d3c9e2 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #pragma once #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix diff --git a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp index 8e42e8b41..2a83b311f 100644 --- a/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/Kokkos/detail/standard_layout_tuple.hpp" // plssvm::kokkos::detail::standard_layout_tuple #include "plssvm/backends/Kokkos/kernel/kernel_functions.hpp" // plssvm::kokkos::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION, Kokkos::View, Kokkos::TeamPolicy, Kokkos::mdspan, Kokkos::dextents @@ -41,11 +41,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the Kokkos kernel function object. 
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -55,12 +55,12 @@ class device_kernel_assembly { * @param[in] grid_size_x the size of the execution grid in x-dimension * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(device_view_type kernel_matrix_d, device_view_type data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(device_view_type kernel_matrix, device_view_type data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, device_view_type q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, const std::size_t grid_size_x, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -78,80 +78,84 @@ class device_kernel_assembly { KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type &team) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_sz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_sz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_sz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_sz = static_cast(PADDING_SIZE); - const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_sz; // current thread in block x-dimension - const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_sz; // current thread in block y-dimension - const auto blockDim_x = THREAD_BLOCK_SIZE_sz; // number of threads in block x-dimension - const auto blockDim_y = THREAD_BLOCK_SIZE_sz; // number of threads in block y-dimension - const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_sz; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_sz; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_sz + threadIdx_x; - - // create the shared memory arrays used for caching data point features - constexpr std::size_t shmem_size = FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(team.team_rank()) / THREAD_BLOCK_SIZE_uz; // current thread in block x-dimension + const auto threadIdx_y = static_cast(team.team_rank()) % THREAD_BLOCK_SIZE_uz; // current thread in block y-dimension + const auto blockDim_x = THREAD_BLOCK_SIZE_uz; // number of threads in block x-dimension + const auto blockDim_y = THREAD_BLOCK_SIZE_uz; // number of threads in block y-dimension + const auto blockIdx_x = static_cast(team.league_rank()) % grid_size_x_ + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(team.league_rank()) / grid_size_x_ + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching data point features + constexpr std::size_t shmem_size = THREAD_BLOCK_SIZE_uz * THREAD_BLOCK_SIZE_uz * INTERNAL_BLOCK_SIZE_uz; real_type *data_cache_ptr = static_cast(team.team_shmem().get_shmem(2 * shmem_size)); - Kokkos::mdspan> data_cache_i{ data_cache_ptr, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - Kokkos::mdspan> data_cache_j{
data_cache_ptr + shmem_size, FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_i_cache{ data_cache_ptr, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; + Kokkos::mdspan> data_j_cache{ data_cache_ptr + shmem_size, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE }; - // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further + // only calculate the upper triangular matrix -> can't use team.team_rank() since all threads in a team must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_sz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_sz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_i(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_i]; - data_cache_j(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - data_cache_j(threadIdx_y + THREAD_BLOCK_SIZE, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_d_[(dim + threadIdx_y + THREAD_BLOCK_SIZE_sz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_sz) + global_j]; - } - team.team_barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), - data_cache_j(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + { + // calculate the indices used in the current thread paying attention to coalesced memory accesses + const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in 
the shared memory + data_i_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache(threadIdx_y, internal * THREAD_BLOCK_SIZE + threadIdx_x) = data_[(dim + threadIdx_y) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + team.team_barrier(); // wait until all threads loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache(block_dim, threadIdx_x * INTERNAL_BLOCK_SIZE + internal_i), + data_j_cache(block_dim, threadIdx_y * INTERNAL_BLOCK_SIZE + internal_j)); + } } } + team.team_barrier(); // wait until all threads performed their part of the calculations } - team.team_barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_sz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -160,11 +164,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - device_view_type kernel_matrix_d_; - device_view_type data_d_; + device_view_type kernel_matrix_; + device_view_type data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const
std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; device_view_type q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl index 481945ca6..99bc02933 100644 --- a/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl +++ b/include/plssvm/backends/OpenCL/kernel/cg_explicit/kernel_matrix_assembly.cl @@ -14,11 +14,11 @@ /** * @brief Create the explicit kernel matrix using the kernel function determined at runtime. * @details The `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST`, `PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER`, `PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION`, and `PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION` placeholder will be replaced by the correct values upon kernel construction. - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -27,78 +27,83 @@ * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST a placeholder that is used to string replace the correct kernel parameter (attention: no comma!; Args... 
only added for Doxygen) */ -__kernel void device_kernel_assembly(__global real_type *kernel_matrix_d, const __global real_type *data_d, const ulong num_rows, const ulong device_num_rows, const ulong row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { +__kernel void device_kernel_assembly(__global real_type *kernel_matrix, const __global real_type *data, const ulong num_rows, const ulong device_num_rows, const ulong device_row_offset, const ulong num_features, const __global real_type *q, const real_type QA_cost, const real_type cost, const ulong grid_x_offset, const ulong grid_y_offset PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER_LIST) { // cast values to 32-bit unsigned int values to prevent implicit conversions const uint local_id_0 = get_local_id(0); const uint local_id_1 = get_local_id(1); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const ulong threadIdx_x = get_local_id(0); // current thread in block x-dimension - const ulong threadIdx_y = get_local_id(1); // current thread in block y-dimension - const ulong blockDim_x = get_local_size(0); // number of threads in block x-dimension - const ulong blockDim_y = get_local_size(1); // number of threads in block y-dimension - const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - - // calculate the indices used in the current thread - const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; - const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; - const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; + const ulong threadIdx_x = get_local_id(0); // current work-item in work-group x-dimension + const ulong threadIdx_y = get_local_id(1); // current work-item in work-group y-dimension + const ulong blockDim_x = get_local_size(0); // number of work-items in work-group x-dimension + const ulong blockDim_y = get_local_size(1); // number of work-items in work-group y-dimension + const ulong blockIdx_x = get_group_id(0) + grid_x_offset; // current work-group in global range x-dimension + offsets if the global range is too large + const ulong blockIdx_y = get_group_id(1) + grid_y_offset; // current work-group in global range y-dimension + offsets if the global range is too large // create the local memory arrays used for caching data point features - __local real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __local real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __local real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_x >= blockIdx_y) { - // create a thread private array used for internal caching + // create a private memory array used for internal caching real_type 
temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { (real_type) 0.0 }; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (ulong dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ul) { - // load data into local memory - for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const ulong global_i = row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - const ulong global_j = row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + { + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const ulong i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; + const ulong j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ul + threadIdx_x; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_i[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i]; - data_cache_j[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - data_cache_j[local_id_1 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_0] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ul) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j]; - } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (ulong dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_ul) { + // load data into local memory + for (uint internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const ulong global_i_linear = device_row_offset + i_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; + const ulong global_j_linear = device_row_offset + j_linear + (ulong) internal * THREAD_BLOCK_SIZE_ul; - // perform the feature reduction calculation - for (uint block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_cache_i[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_cache_j[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + // store the values in the local memory + data_i_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_i_linear]; + data_j_cache[local_id_1][internal * THREAD_BLOCK_SIZE + local_id_0] = data[(dim + threadIdx_y) * (num_rows + (ulong) 1 + PADDING_SIZE_ul) + global_j_linear]; + } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items loaded their part of the data + + // perform the feature reduction calculation + for (uint block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + 
temp[internal_i][internal_j] += PLSSVM_OPENCL_FEATURE_REDUCE_FUNCTION(data_i_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_i], data_j_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } - barrier(CLK_LOCAL_MEM_FENCE); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const ulong i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ul; + const ulong j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ul; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (uint internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (uint internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data points and wrt the current device const ulong device_global_i = i + (ulong) internal_i; - const ulong global_i = row_offset + i + (ulong) internal_i; + const ulong global_i = device_row_offset + device_global_i; const ulong device_global_j = j + (ulong) internal_j; - const ulong global_j = row_offset + j + (ulong) internal_j; + const ulong global_j = device_row_offset + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = PLSSVM_OPENCL_APPLY_KERNEL_FUNCTION(temp_ij PLSSVM_OPENCL_KERNEL_FUNCTION_PARAMETER) + QA_cost - q[global_i] - q[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix_d[device_global_j * (num_rows - row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_ul) - device_global_j * (device_global_j + (ulong) 1) / (ulong) 2 + device_global_i] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 65587ddaa..22b24bae0 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item @@ -35,11 +35,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
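// Illustrative sketch: every assembly kernel in this patch stores its result through the same
// packed upper-triangular index, device_global_j * (num_rows - device_row_offset + PADDING_SIZE)
// - device_global_j * (device_global_j + 1) / 2 + device_global_i. The host-side helper below only
// demonstrates how that formula linearizes (i, j) pairs with i >= j; packed_index, n_device_rows,
// and padding are hypothetical names chosen for this sketch, where n_device_rows stands for
// num_rows - device_row_offset.
#include <cstddef>

constexpr std::size_t packed_index(const std::size_t device_global_i, const std::size_t device_global_j,
                                   const std::size_t n_device_rows, const std::size_t padding) {
    // column j owns (n_device_rows + padding - j) slots: the valid entries i = j .. n_device_rows - 1
    // plus `padding` trailing slots that are written but never read, avoiding bounds checks
    return device_global_j * (n_device_rows + padding)
           - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 }
           + device_global_i;
}

int main() {
    constexpr std::size_t n = 6;    // rows from this device's first row to the end (illustrative)
    constexpr std::size_t pad = 2;  // padding slots per column (illustrative)

    static_assert(packed_index(0, 0, n, pad) == 0);                                        // column 0 starts at index 0
    static_assert(packed_index(1, 1, n, pad) == n + pad);                                  // column 0 occupies n + pad slots
    static_assert(packed_index(2, 2, n, pad) - packed_index(1, 1, n, pad) == n + pad - 1); // each later column shrinks by one slot
    return 0;
}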
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +48,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -69,25 +69,27 @@ class device_kernel_assembly { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + // only calculate the upper triangular matrix if (i >= j) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; for (std::size_t dim = 0; dim < num_features_; ++dim) { // perform the feature reduction calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + const auto global_i = device_row_offset_ + i + static_cast(internal_i); + const auto global_j = device_row_offset_ + j + static_cast(internal_j); + temp[internal_i][internal_j] += detail::feature_reduce(data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], + data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); } } } @@ -95,22 +97,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && 
global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -119,11 +122,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index b09fef0f8..d3e37ca54 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -14,11 +14,12 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -35,11 +36,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
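// Illustrative sketch: after the feature reduction, every kernel above finishes an entry the same
// way: apply the kernel function, add the dimensional-reduction terms QA_cost - q[i] - q[j], and
// add `cost` on the main diagonal. The scalar helper below only mirrors that final step;
// assemble_entry and the toy values are hypothetical, and k_ij stands for the value returned by
// the already-applied kernel function.
#include <cstddef>
#include <vector>

using real_type = double;  // assumption for this sketch; the library configures real_type itself

real_type assemble_entry(const real_type k_ij, const std::vector<real_type> &q,
                         const std::size_t global_i, const std::size_t global_j,
                         const real_type QA_cost, const real_type cost) {
    real_type temp_ij = k_ij + QA_cost - q[global_i] - q[global_j];
    if (global_i == global_j) {
        temp_ij += cost;  // regularization term only on the main diagonal
    }
    return temp_ij;
}

int main() {
    const std::vector<real_type> q{ 0.5, 1.0, 1.5 };  // toy q vector from the dimensional reduction
    const real_type QA_cost = 2.0;
    const real_type cost = 10.0;
    const real_type off_diag = assemble_entry(3.0, q, 0, 1, QA_cost, cost);  // 3.0 + 2.0 - 0.5 - 1.0 = 3.5
    const real_type diag = assemble_entry(3.0, q, 1, 1, QA_cost, cost);      // 3.0 + 2.0 - 1.0 - 1.0 + 10.0 = 13.0
    return (off_diag == 3.5 && diag == 13.0) ? 0 : 1;
}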
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +49,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -68,67 +69,47 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; - - ::sycl::private_memory temp{ group }; - - // initialize private and local variables - group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; - } - } - }); + // create two local memory arrays used for caching data point features + real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - // implicit group barrier + // create a private memory array used for internal caching + ::sycl::private_memory, INTERNAL_BLOCK_SIZE>, 2> temp{ group }; - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = 
static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; } }); @@ -136,14 +117,15 @@ class device_kernel_assembly { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent 
implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -154,26 +136,40 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = 
temp(idx)[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -183,11 +179,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 4ed3764ce..33c725a46 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -14,11 +14,12 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item +#include // std::array #include // std::size_t #include // std::tuple, std::make_tuple @@ -35,11 +36,11 @@ class device_kernel_assembly { public: /** * @brief Initialize the SYCL kernel function object. 
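// Illustrative sketch: the hierarchical and scoped kernels use the same two-level blocking as the
// other backends: the feature dimension is walked in tiles (now THREAD_BLOCK_SIZE wide instead of
// FEATURE_BLOCK_SIZE wide), each tile is staged in a small cache, and every work-item accumulates
// an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE block of partial results in private memory. The
// serial, host-side analogue below only demonstrates that loop structure; all names and sizes are
// hypothetical and the "feature reduction" is a plain dot product.
#include <array>
#include <cstddef>
#include <vector>

constexpr unsigned tile_width = 4;      // plays the role of THREAD_BLOCK_SIZE
constexpr unsigned internal_block = 2;  // plays the role of INTERNAL_BLOCK_SIZE

std::array<std::array<double, internal_block>, internal_block>
blocked_reduce(const std::vector<std::vector<double>> &points,
               const std::size_t i0, const std::size_t j0, const std::size_t num_features) {
    std::array<std::array<double, internal_block>, internal_block> temp{};  // private accumulator tile
    std::array<std::array<double, internal_block>, tile_width> cache_i{};   // stand-in for the data_i cache
    std::array<std::array<double, internal_block>, tile_width> cache_j{};   // stand-in for the data_j cache

    for (std::size_t dim = 0; dim < num_features; dim += tile_width) {
        // "load data into the cache" step: stage one feature tile for each blocked point
        for (unsigned f = 0; f < tile_width; ++f) {
            for (unsigned b = 0; b < internal_block; ++b) {
                const std::size_t feature = dim + f;
                cache_i[f][b] = feature < num_features ? points[i0 + b][feature] : 0.0;  // zero-pad the last tile
                cache_j[f][b] = feature < num_features ? points[j0 + b][feature] : 0.0;
            }
        }
        // "feature reduction" step: consume the cached tile for all internal_block x internal_block pairs
        for (unsigned f = 0; f < tile_width; ++f) {
            for (unsigned bi = 0; bi < internal_block; ++bi) {
                for (unsigned bj = 0; bj < internal_block; ++bj) {
                    temp[bi][bj] += cache_i[f][bi] * cache_j[f][bj];
                }
            }
        }
    }
    return temp;
}

int main() {
    // four toy data points with five features each
    const std::vector<std::vector<double>> points{
        { 1, 0, 2, 0, 1 }, { 0, 1, 1, 1, 0 }, { 2, 2, 0, 1, 1 }, { 1, 1, 1, 1, 1 }
    };
    const auto tile = blocked_reduce(points, 0, 2, 5);
    // tile[0][0] is the dot product of points[0] and points[2]: 1*2 + 0*2 + 2*0 + 0*1 + 1*1 = 3
    return tile[0][0] == 3.0 ? 0 : 1;
}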
- * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -48,12 +49,12 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -71,94 +72,100 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); - - // exploit symmetry + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp + [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { - // load data into shared memory + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const std::size_t threadIdx_x = 
idx.get_local_id(group, 0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; } }); - // perform calculations + // perform the feature reduction calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += 
detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } } } }); } + // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data points and wrt the current device + const auto device_global_i = i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; + const auto device_global_j = j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp(idx)[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update 
the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -169,11 +176,11 @@ class device_kernel_assembly { private: /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 96030fbe7..6e7fd2033 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -36,11 +36,11 @@ class device_kernel_assembly { /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] kernel_matrix_d the calculated kernel matrix - * @param[in] data_d the data points to calculate the kernel matrix from + * @param[out] kernel_matrix the calculated kernel matrix + * @param[in] data the data points to calculate the kernel matrix from * @param[in] num_rows the number of data points * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data_d the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] q the vector used in the dimensional reduction * @param[in] QA_cost the scalar used in the dimensional reduction @@ -49,14 +49,14 @@ class device_kernel_assembly { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix_d, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_j_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - kernel_matrix_d_{ kernel_matrix_d }, - data_d_{ data_d }, + device_kernel_assembly(::sycl::handler &cgh, real_type *kernel_matrix, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type *q, const real_type QA_cost, const real_type cost, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + data_i_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_j_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + kernel_matrix_{ kernel_matrix }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, q_{ q }, QA_cost_{ QA_cost }, @@ -76,74 +76,78 @@ class device_kernel_assembly { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current work-item in work-group x-dimension + const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current work-item in work-group y-dimension + const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of work-items in work-group x-dimension + const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of work-items in work-group y-dimension + const std::size_t blockIdx_x = 
nd_idx.get_group(0) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { - // create a work-item private array used for internal caching + // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - data_cache_i_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + { + // calculate the indices used in the current work-item paying attention to coalesced memory accesses + const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data points, pays attention to coalesced memory accesses + const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + 
local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + // perform the feature reduction calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the kernel matrix (the part stored on the current device) + // calculate the indices to access the global data points and wrt the current device const auto device_global_i = i + static_cast(internal_i); - const auto global_i = row_offset_ + i + static_cast(internal_i); + const auto global_i = device_row_offset_ + device_global_i; const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); + const auto global_j = device_row_offset_ + device_global_j; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { real_type temp_ij = temp[internal_i][internal_j]; + // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; // apply the cost on the diagonal if (global_i == global_j) { temp_ij += cost_; } - // update the kernel matrix - kernel_matrix_d_[device_global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; } } } @@ -152,16 +156,16 @@ class device_kernel_assembly { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor data_i_cache_; /// Local memory used for internal memory access optimizations. 
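The packed index used above to store temp_ij lays the padded upper-triangular kernel matrix out row by row. A minimal host-side sketch of that mapping, with num_rows, device_row_offset, and padding standing in for the corresponding kernel members (illustrative only, not library code):

    #include <cstddef>

    // Packed index of entry (device_global_i, device_global_j), with device_global_i >= device_global_j,
    // in the row-wise padded upper-triangular storage used for the explicit kernel matrix.
    std::size_t packed_upper_triangular_index(const std::size_t device_global_i,
                                              const std::size_t device_global_j,
                                              const std::size_t num_rows,
                                              const std::size_t device_row_offset,
                                              const std::size_t padding) {
        const std::size_t padded_row_length = num_rows - device_row_offset + padding;
        // packed row j starts at j * padded_row_length - j * (j - 1) / 2; adding the column
        // shift (device_global_i - device_global_j) folds into the j * (j + 1) / 2 term below
        return device_global_j * padded_row_length
               - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 }
               + device_global_i;
    }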
- ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor data_j_cache_; /// @cond Doxygen_suppress - real_type *kernel_matrix_d_; - const real_type *data_d_; + real_type *kernel_matrix_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type *q_; const real_type QA_cost_; diff --git a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp index 93772aab3..51e11a282 100644 --- a/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/stdpar/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -14,7 +14,7 @@ #pragma once #include "plssvm/backends/stdpar/kernel/kernel_functions.hpp" // plssvm::stdpar::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index e99dbeddd..81d992991 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -38,11 +38,8 @@ constexpr unsigned INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; constexpr unsigned INTERNAL_BLOCK_SIZE = 4; #endif -/// Global compile time constant used for internal feature caching. -constexpr unsigned FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE; - -/// Padding used for the device w_d matrix to prevent out-of-bounce accesses without ifs. -constexpr unsigned PADDING_SIZE = FEATURE_BLOCK_SIZE > (THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) ? FEATURE_BLOCK_SIZE : (THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE); +/// Padding used for the device arrays and matrices to prevent out-of-bounce accesses without ifs. +constexpr unsigned PADDING_SIZE = THREAD_BLOCK_SIZE *INTERNAL_BLOCK_SIZE; // perform sanity checks static_assert(detail::tuple_contains_v, "Illegal real type provided! 
See the 'real_type_list' in the type_list.hpp header for a list of the allowed types."); diff --git a/src/plssvm/backends/Kokkos/csvm.cpp b/src/plssvm/backends/Kokkos/csvm.cpp index 2bf512433..e18c88328 100644 --- a/src/plssvm/backends/Kokkos/csvm.cpp +++ b/src/plssvm/backends/Kokkos/csvm.cpp @@ -20,7 +20,7 @@ #include "plssvm/backends/Kokkos/kernel/cg_explicit/kernel_matrix_assembly.hpp" // plssvm::kokkos::detail::device_kernel_assembly #include "plssvm/backends/Kokkos/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp" // plssvm::kokkos::detail::device_kernel_assembly_symm #include "plssvm/backends/Kokkos/kernel/predict_kernel.hpp" // plssvm::kokkos::detail::{device_kernel_w_linear, device_kernel_predict_linear, device_kernel_predict} -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE +#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/log_untracked.hpp" // plssvm::detail::log_untracked @@ -414,7 +414,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons device_ptr_type kernel_matrix_d{ num_entries_padded, devices_[device_id] }; // only explicitly store the upper triangular matrix const real_type cost_factor = real_type{ 1.0 } / params.cost; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -492,7 +492,7 @@ void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const : // get the offset of the data points this device is responsible for const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); // the necessary amount of scratch memory for the kernels - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -592,7 +592,7 @@ void csvm::run_assemble_kernel_matrix_implicit_blas_level_3(const std::size_t de const unsigned long long row_offset = data_distribution_->place_row_offset(device_id); const real_type cost_factor = real_type{ 1.0 } / params.cost; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size = static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); @@ -702,7 +702,7 @@ auto csvm::run_predict_kernel(const std::size_t device_id, const ::plssvm::detai device_ptr_type out_d{ shape{ num_predict_points, num_classes }, shape{ PADDING_SIZE, PADDING_SIZE }, devices_[device_id] }; - const std::size_t scratch_memory_size = static_cast(2u * FEATURE_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); + const std::size_t scratch_memory_size 
= static_cast(2u * THREAD_BLOCK_SIZE * THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * sizeof(real_type); // save the team size const int team_size = detail::dim_type_to_native(exec.block); diff --git a/src/plssvm/backends/OpenCL/detail/utility.cpp b/src/plssvm/backends/OpenCL/detail/utility.cpp index 6b3f686ae..e3202bb6b 100644 --- a/src/plssvm/backends/OpenCL/detail/utility.cpp +++ b/src/plssvm/backends/OpenCL/detail/utility.cpp @@ -13,7 +13,7 @@ #include "plssvm/backends/OpenCL/detail/error_code.hpp" // plssvm::opencl::detail::error_code #include "plssvm/backends/OpenCL/detail/jit_info.hpp" // plssvm::opencl::detail::jit_info #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::compute_kernel_name, plssvm::opencl::detail::kernel -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked @@ -359,12 +359,10 @@ std::pair, jit_info> create_command_queues(const mpi: // replace constants in kernel_src_string // replace the size_t variants -> BEFORE replacing the "normal" values ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE_ul", fmt::format("(ulong) {}", THREAD_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "FEATURE_BLOCK_SIZE_ul", fmt::format("(ulong) {}", FEATURE_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE_ul", fmt::format("(ulong) {}", INTERNAL_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE_ul", fmt::format("(ulong) {}", PADDING_SIZE)); // replace the normal variants ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE", fmt::format("{}", THREAD_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "FEATURE_BLOCK_SIZE", fmt::format("{}", FEATURE_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE", fmt::format("{}", INTERNAL_BLOCK_SIZE)); ::plssvm::detail::replace_all(kernel_src_string, "PADDING_SIZE", fmt::format("{}", PADDING_SIZE)); diff --git a/src/plssvm/detail/tracking/performance_tracker.cpp b/src/plssvm/detail/tracking/performance_tracker.cpp index 58b4e975a..8598367dc 100644 --- a/src/plssvm/detail/tracking/performance_tracker.cpp +++ b/src/plssvm/detail/tracking/performance_tracker.cpp @@ -8,7 +8,7 @@ #include "plssvm/detail/tracking/performance_tracker.hpp" -#include "plssvm/constants.hpp" // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::FEATURE_BLOCK_SIZE, plssvm::PADDING_SIZE +#include "plssvm/constants.hpp" // plssvm::real_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE, plssvm::PADDING_SIZE #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT, PLSSVM_ASSERT_ENABLED #include "plssvm/detail/cmd/parser_predict.hpp" // plssvm::detail::cmd::parser_predict @@ -280,7 +280,6 @@ void performance_tracker::save(std::ostream &out) { " asserts: {}\n" " enforce_max_mem_alloc_size: {}\n" " THREAD_BLOCK_SIZE: {}\n" - " FEATURE_BLOCK_SIZE: {}\n" " INTERNAL_BLOCK_SIZE: {}\n" " PADDING_SIZE: {}\n", plssvm::detail::current_date_time(), @@ -295,7 +294,6 @@ void 
performance_tracker::save(std::ostream &out) { assert_enabled, enforce_max_mem_alloc_size, THREAD_BLOCK_SIZE, - FEATURE_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE); From 6cddbb6e98f46ebd21d07d7e347402d03ad093c0 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 26 May 2025 15:42:48 +0200 Subject: [PATCH 25/93] Additional performance improvement tests. --- .../work_group/kernel_matrix_assembly.hpp | 26 +++++++++++--- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 34 ++++++++++++++----- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 6e7fd2033..560d556ea 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -16,6 +16,7 @@ #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -27,10 +28,11 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @details target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: /** @@ -111,12 +113,26 @@ class device_kernel_assembly { } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::gpu_amd) { + // perform the feature reduction calculation, the block_dim is the slowest moving index + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } + } else { + // perform the feature reduction calculation, the block_dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + 
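For reference, the two loop orders selected by the if constexpr branch boil down to the following stripped-down forms, written over plain arrays with TB and IB standing in for THREAD_BLOCK_SIZE and INTERNAL_BLOCK_SIZE and a simple product standing in for feature_reduce (an illustrative sketch, not the kernel's actual code):

    // block_dim as the slowest moving (outermost) index, as used on AMD GPUs
    template <unsigned TB, unsigned IB>
    void reduce_block_dim_outermost(const double cache_i[TB][IB], const double cache_j[TB][IB], double temp[IB][IB]) {
        for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
            for (unsigned i = 0; i < IB; ++i) {
                for (unsigned j = 0; j < IB; ++j) {
                    temp[i][j] += cache_i[block_dim][i] * cache_j[block_dim][j];
                }
            }
        }
    }

    // block_dim as the fastest moving (innermost) index, accumulating into a register first
    template <unsigned TB, unsigned IB>
    void reduce_block_dim_innermost(const double cache_i[TB][IB], const double cache_j[TB][IB], double temp[IB][IB]) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                double sum{ 0.0 };
                for (unsigned block_dim = 0; block_dim < TB; ++block_dim) {
                    sum += cache_i[block_dim][i] * cache_j[block_dim][j];
                }
                temp[i][j] += sum;
            }
        }
    }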
temp[internal_i][internal_j] += sum; } } } diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 7c56bcd91..12910a7ae 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -223,10 +223,12 @@ ::plssvm::detail::dim_type csvm::get_max_grid_size(const std::size_t device_id) // fit // //***************************************************// -auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { +// TODO: better! +template +auto dispatch_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost, sycl::kernel_invocation_type invocation_type_, Device& devices_, Distribution& data_distribution_) { const std::size_t num_rows_reduced = data_d.shape().x - 1; const std::size_t num_features = data_d.shape().y; - const queue_type &device = devices_[device_id]; + const auto &device = devices_[device_id]; // calculate the number of data points this device is responsible for const std::size_t device_specific_num_rows = data_distribution_->place_specific_num_rows(device_id); @@ -260,7 +262,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), - sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); + sycl::detail::work_group::device_kernel_assembly{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x }); }); break; case sycl::kernel_invocation_type::hierarchical: @@ -293,7 +295,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, params.degree, std::get(params.gamma), params.coef0 }); }); @@ -329,7 +331,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, 
kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -365,7 +367,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma), params.coef0 }); }); @@ -401,7 +403,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -437,7 +439,7 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons break; case sycl::kernel_invocation_type::work_group: device.impl->sycl_queue.submit([&, &partial_grid_ref = partial_grid, &offsets_ref = offsets](::sycl::handler &cgh) { - using functor_type = sycl::detail::work_group::device_kernel_assembly; + using functor_type = sycl::detail::work_group::device_kernel_assembly; cgh.parallel_for(detail::get_execution_range(partial_grid_ref, exec.block), functor_type{ cgh, kernel_matrix_d.get(), data_d.get(), num_rows_reduced, device_specific_num_rows, row_offset, num_features, q_red_d.get(), QA_cost, cost_factor, offsets_ref.y, offsets_ref.x, std::get(params.gamma) }); }); @@ -467,6 +469,22 @@ auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, cons return kernel_matrix_d; } +auto csvm::run_assemble_kernel_matrix_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const parameter ¶ms, const device_ptr_type &data_d, const device_ptr_type &q_red_d, real_type QA_cost) const -> device_ptr_type { + switch (target_) { + case target_platform::automatic: + // error + throw backend_exception{ "Can't determine the target platform!" 
}; + case target_platform::gpu_nvidia: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::gpu_amd: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::gpu_intel: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + case target_platform::cpu: + return dispatch_assemble_kernel_matrix_explicit(device_id, exec, params, data_d, q_red_d, QA_cost, invocation_type_, devices_, data_distribution_); + } +} + void csvm::run_blas_level_3_kernel_explicit(const std::size_t device_id, const ::plssvm::detail::execution_range &exec, const ::plssvm::detail::execution_range &mirror_exec, const real_type alpha, const device_ptr_type &A_d, const device_ptr_type &B_d, const real_type beta, device_ptr_type &C_d) const { const std::size_t num_rhs = B_d.shape().x; const std::size_t num_rows = B_d.shape().y; From a185caf542bc6fd1e65230783e431f158e0633c4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Mon, 26 May 2025 16:25:07 +0200 Subject: [PATCH 26/93] Preliminary changes. --- .../SYCL/kernel/cg_explicit/basic/blas.hpp | 2 +- .../kernel/cg_explicit/hierarchical/blas.hpp | 30 ++++------ .../SYCL/kernel/cg_explicit/scoped/blas.hpp | 30 ++++------ .../kernel/cg_explicit/work_group/blas.hpp | 32 ++++------ .../basic/kernel_matrix_assembly_blas.hpp | 2 +- .../kernel_matrix_assembly_blas.hpp | 60 +++++++++---------- .../scoped/kernel_matrix_assembly_blas.hpp | 56 +++++++---------- .../kernel_matrix_assembly_blas.hpp | 55 +++++++---------- .../kernel/predict/basic/predict_kernel.hpp | 2 +- .../predict/hierarchical/predict_kernel.hpp | 39 +++++------- .../kernel/predict/scoped/predict_kernel.hpp | 37 +++++------- .../predict/work_group/predict_kernel.hpp | 40 +++++-------- 12 files changed, 151 insertions(+), 234 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index 2e528149c..b55b374fe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index de6358ec8..5e5803652 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -60,8 +60,8 @@ class device_kernel_symm { */ void 
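run_assemble_kernel_matrix_explicit above translates the run-time target_platform value into a compile-time template argument with a plain switch; reduced to a self-contained sketch, with a hypothetical launch_kernel stub in place of the real queue submission (not the library's API):

    #include <stdexcept>

    #include "plssvm/target_platforms.hpp"  // plssvm::target_platform

    // hypothetical stub standing in for the target-specialized kernel launch
    template <plssvm::target_platform target>
    void launch_kernel() { /* submit the functor templated on `target` here */ }

    // map the run-time enum value onto the compile-time template parameter
    void launch_for(const plssvm::target_platform target) {
        switch (target) {
            case plssvm::target_platform::automatic:
                throw std::runtime_error{ "Can't determine the target platform!" };
            case plssvm::target_platform::gpu_nvidia:
                return launch_kernel<plssvm::target_platform::gpu_nvidia>();
            case plssvm::target_platform::gpu_amd:
                return launch_kernel<plssvm::target_platform::gpu_amd>();
            case plssvm::target_platform::gpu_intel:
                return launch_kernel<plssvm::target_platform::gpu_intel>();
            case plssvm::target_platform::cpu:
                return launch_kernel<plssvm::target_platform::cpu>();
        }
    }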
operator()(::sycl::group<2> group) const { // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory i{ group }; @@ -98,7 +98,7 @@ class device_kernel_symm { }); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -119,15 +119,8 @@ class device_kernel_symm { } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -138,7 +131,7 @@ class device_kernel_symm { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -227,8 +220,8 @@ class device_kernel_symm_mirror { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type A_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory i{ group }; @@ -264,7 +257,7 @@ class 
device_kernel_symm_mirror { }); // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -279,12 +272,9 @@ class device_kernel_symm_mirror { const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -295,7 +285,7 @@ class device_kernel_symm_mirror { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 9e8500d73..2e6983255 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -62,8 +62,8 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + 
::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -88,7 +88,7 @@ class device_kernel_symm { j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -109,15 +109,8 @@ class device_kernel_symm { } else { A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -126,7 +119,7 @@ class device_kernel_symm { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -215,8 +208,8 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -241,7 +234,7 @@ class device_kernel_symm_mirror { j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into shared memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -256,12 +249,9 @@ class device_kernel_symm_mirror { const auto global_i = i_linear(idx) + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } }); @@ -270,7 +260,7 @@ class device_kernel_symm_mirror { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index ae07f7ec6..965b043a3 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -43,8 +43,8 @@ class device_kernel_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + A_cache_{ ::sycl::range<2>{ 
static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, device_specific_num_rows_{ device_specific_num_rows }, @@ -75,7 +75,6 @@ class device_kernel_symm { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -88,7 +87,7 @@ class device_kernel_symm { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; @@ -100,20 +99,13 @@ class device_kernel_symm { } else { A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x + THREAD_BLOCK_SIZE < global_j) { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; - } B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; @@ -183,8 +175,8 @@ class device_kernel_symm_mirror { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ 
device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - A_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - B_cache_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, @@ -216,7 +208,6 @@ class device_kernel_symm_mirror { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -229,23 +220,20 @@ class device_kernel_symm_mirror { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - A_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz - std::size_t{ 1 }) * (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - B_cache_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } nd_idx.barrier(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; 
++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 7b517a7b1..9c82ad31d 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 1a24024b6..342e8308b 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -80,6 +80,10 @@ class device_kernel_assembly_symm { ::sycl::private_memory j{ group }; ::sycl::private_memory j_linear{ group }; + // create the shared memory arrays used for caching data point features + real_type data_cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + ::sycl::private_memory temp{ group }; // initialize private and local variables @@ -112,12 +116,12 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { { - // allocate shared memory - real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_cache_i = reinterpret_cast(data_cache_one); + auto data_cache_j = 
reinterpret_cast(data_cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -132,11 +136,9 @@ class device_kernel_assembly_symm { const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } }); @@ -147,7 +149,7 @@ class device_kernel_assembly_symm { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -189,12 +191,12 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { - // allocate shared memory - real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(data_cache_one); + auto C_out_cache = reinterpret_cast(data_cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -208,26 +210,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory 
B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[local_id_1 * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -242,13 +242,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_y = idx.get_local_id(1); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j(idx) + static_cast(internal); detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1 + THREAD_BLOCK_SIZE]; } }); @@ -274,12 +272,12 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { - // allocate shared memory - real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(data_cache_one); + auto C_out_cache = reinterpret_cast(data_cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -293,26 +291,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear(idx) + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_i][internal_j] * B_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -327,13 +323,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_x = idx.get_local_id(0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i(idx) + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp index 4391f2f19..c84db480f 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // 
sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -77,8 +77,8 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -106,7 +106,7 @@ class device_kernel_assembly_symm { // exploit symmetry if (group[1] >= group[0]) { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -121,11 +121,9 @@ class device_kernel_assembly_symm { const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } }); @@ -134,7 +132,7 @@ class device_kernel_assembly_symm { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -172,11 +170,11 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // 
iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -190,24 +188,22 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; } }); // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp(idx)[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } }); @@ -220,13 +216,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + 
dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -249,11 +243,11 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { // allocate shared memory - auto &B_cache = data_cache_i; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = data_cache_i; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -267,26 +261,24 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] 
+= + temp(idx)[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } }); @@ -301,13 +293,11 @@ class device_kernel_assembly_symm { const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i(idx) + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 34b55fff4..2e6ea3f4f 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -54,8 +54,8 @@ class device_kernel_assembly_symm { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(FEATURE_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + data_cache_i_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + data_cache_j_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, data_d_{ data_d }, @@ -90,7 +90,6 @@ class device_kernel_assembly_symm { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -106,22 +105,20 @@ class device_kernel_assembly_symm { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_i_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - data_cache_j_[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { 
temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], @@ -158,29 +155,27 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the UPPER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE] + auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * FEATURE_BLOCK_SIZE + local_id_0 + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the local memory + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * FEATURE_BLOCK_SIZE + (class_idx + local_id_1) % FEATURE_BLOCK_SIZE]; + C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal_j) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[(local_id_1 * INTERNAL_BLOCK_SIZE + internal_i) * THREAD_BLOCK_SIZE + (class_idx + local_id_1) % THREAD_BLOCK_SIZE]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -189,8 +184,7 @@ class device_kernel_assembly_symm { // add intermediate cached results to C for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j 
= row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * FEATURE_BLOCK_SIZE + local_id_1 + THREAD_BLOCK_SIZE]; + detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated C with their values } @@ -211,29 +205,27 @@ class device_kernel_assembly_symm { // calculate C += alpha * temp * B for the LOWER triangular matrix { // rename cached arrays - auto &B_cache = data_cache_i_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = data_cache_i_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = data_cache_j_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - B_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_i][internal_j] * B_cache[((class_idx + local_id_0) % FEATURE_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal_i * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_i][internal_j] * 
B_cache[((class_idx + local_id_0) % THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -243,7 +235,6 @@ class device_kernel_assembly_symm { for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_i = row_offset_ + i + static_cast(internal); detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += C_out_cache[(local_id_0 + THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index c16965cb1..631bf80a1 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::item diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 4098c4914..dedfe609e 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -202,8 +202,8 @@ class device_kernel_predict_linear { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_w[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory pp_idx{ group }; @@ -241,7 +241,7 @@ class device_kernel_predict_linear { // implicit group barrier // iterate over all 
support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -256,11 +256,9 @@ class device_kernel_predict_linear { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } }); @@ -271,7 +269,7 @@ class device_kernel_predict_linear { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -356,8 +354,8 @@ class device_kernel_predict { */ void operator()(::sycl::group<2> group) const { // allocate shared memory - real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type data_cache_sv[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // calculate the indices used in the current work-item ::sycl::private_memory pp_idx{ group }; @@ -393,7 +391,7 @@ class device_kernel_predict { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -408,11 +406,9 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } }); @@ -423,7 +419,7 @@ class device_kernel_predict { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -454,7 +450,7 @@ class device_kernel_predict { // auto &out_cache = data_cache_sv; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); @@ -470,15 +466,12 @@ class device_kernel_predict { const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -486,15 +479,15 @@ class device_kernel_predict { // implicit group barrier // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); for 
(unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + data_cache_sv[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -509,14 +502,12 @@ class device_kernel_predict { const std::size_t threadIdx_x = idx.get_local_id(0); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx(idx) + static_cast(internal); detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index 1a42161f5..e6d56ec56 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -191,8 +191,8 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -218,7 +218,7 @@ class device_kernel_predict_linear { }); // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -233,11 +233,9 @@ class device_kernel_predict_linear { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; 
const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } }); @@ -246,7 +244,7 @@ class device_kernel_predict_linear { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -332,8 +330,8 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), + ::sycl::require_local_mem(), + ::sycl::require_local_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), ::sycl::require_private_mem(), @@ -356,7 +354,7 @@ class device_kernel_predict { }); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -371,11 +369,9 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + 
local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } }); @@ -384,7 +380,7 @@ class device_kernel_predict { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -410,7 +406,7 @@ class device_kernel_predict { auto &out_cache = data_cache_sv; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(FEATURE_BLOCK_SIZE)) { + for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); @@ -426,29 +422,26 @@ class device_kernel_predict { const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp 
b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index d451ac7d5..6612a10d8 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -15,7 +15,7 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -159,8 +159,8 @@ class device_kernel_predict_linear { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_w_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, prediction_d_{ prediction_d }, w_d_{ w_d }, rho_d_{ rho_d }, @@ -189,7 +189,6 @@ class device_kernel_predict_linear { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -202,22 +201,20 @@ class device_kernel_predict_linear { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the local memory + // store the values in the local memory data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + 
global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - data_cache_w_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; @@ -283,8 +280,8 @@ class device_kernel_predict { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(FEATURE_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + data_cache_sv_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, prediction_d_{ prediction_d }, alpha_d_{ alpha_d }, rho_d_{ rho_d }, @@ -316,7 +313,6 @@ class device_kernel_predict { const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_uz = static_cast(FEATURE_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item @@ -329,22 +325,20 @@ class device_kernel_predict { { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // 
FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory + // store the values in the shared memory data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_pp_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - data_cache_sv_[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; } nd_idx.barrier(); // wait until all work-items loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], @@ -369,31 +363,28 @@ class device_kernel_predict { auto &out_cache = data_cache_sv_; // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += FEATURE_BLOCK_SIZE_uz) { + for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - alpha_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x + THREAD_BLOCK_SIZE_uz) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x + THREAD_BLOCK_SIZE_uz]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; - out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } nd_idx.barrier(); // wait until all work-items loaded their part of the data // calculate intermediate results and store them in local memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % FEATURE_BLOCK_SIZE][local_id_0 * 
INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations @@ -404,7 +395,6 @@ class device_kernel_predict { const auto global_pp_idx = pp_idx + static_cast(internal); detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } From c74aca83b21f21ed12fa6257ca347d35f41997f2 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 28 May 2025 13:48:21 +0200 Subject: [PATCH 27/93] Update CUDA implementation and update comments. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 295 +++++++------- .../cg_explicit/kernel_matrix_assembly.cuh | 44 +-- .../kernel_matrix_assembly_blas.cuh | 179 +++++---- .../backends/CUDA/kernel/kernel_functions.cuh | 34 +- .../backends/CUDA/kernel/predict_kernel.cuh | 374 +++++++++--------- 5 files changed, 471 insertions(+), 455 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index 2f7b37a0f..1a6be4ae8 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -13,7 +13,7 @@ #define PLSSVM_BACKENDS_CUDA_KERNEL_CG_EXPLICIT_BLAS_CUH_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} namespace plssvm::cuda::detail { @@ -22,8 +22,8 @@ namespace plssvm::cuda::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
 * @param[in] num_rows the number of rows in @p A and @p C
 * @param[in] num_rhs the number of columns in @p B and @p C
- * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices
- * @param[in] row_offset the first row this device is responsible for
+ * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices
+ * @param[in] device_row_offset the first row this device is responsible for
 * @param[in] alpha the scalar alpha value
 * @param[in] A the matrix @p A
 * @param[in] B the matrix @p B
@@ -32,78 +32,77 @@ namespace plssvm::cuda::detail {
 * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
 * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
 */
-__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) {
-    // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
-    const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension
-    const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension
-    const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension
-    const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension
-    const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large
-    const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large
-    const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE);
-    const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE);
-    const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE);
-    const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE);
-
-    // calculate the indices used in the current thread
-    const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs
-    const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x;
-    const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows
-    const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x;
-
-    // create the shared memory arrays used for caching data point features
-    __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE];
-    __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE];
+__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) {
+    // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows
+    const auto
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory 
accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * 
temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i];
+            // calculate the indices to access the global data and the data with respect to the current device
+            const auto global_i_idx = i_idx + static_cast(internal_i);
+            const auto device_global_j_idx = j_idx + static_cast(internal_j);
+            const auto global_j_idx = device_row_offset + device_global_j_idx;
+
+            // be sure to not perform out-of-bounds accesses
+            if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) {
+                C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA
            }
        }
    }
@@ -115,8 +114,8 @@ __global__ void device_kernel_symm(const unsi
 * @param[in] num_rows the number of rows in @p A and @p C
 * @param[in] num_rhs the number of columns in @p B and @p C
 * @param[in] num_mirror_rows the number of rows to mirror down
- * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices
- * @param[in] row_offset the first row this device is responsible for
+ * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; the rows in @p A are potentially distributed across multiple devices
+ * @param[in] device_row_offset the first row this device is responsible for
 * @param[in] alpha the scalar alpha value
 * @param[in] A the matrix @p A
 * @param[in] B the matrix @p B
@@ -125,68 +124,72 @@ __global__ void device_kernel_symm(const unsi
 * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used
 * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used
 */
-__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) {
-    // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows
-    const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension
-    const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension
-    const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension
-    const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension
-    const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large
-    const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large
-    const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE);
-    const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE);
-    const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE);
-    const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE);
-
-    // calculate the indices used in the current thread
-    const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs
-    const auto i_linear = blockIdx_x
* blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - 
B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + device_specific_num_rows + j + 
static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -200,27 +203,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * 
INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -233,27 +238,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // 
calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index 2a3eef5c4..e4a3fa22d 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -52,7 +52,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large - // create two shared memory arrays used for caching data point features + // create two shared memory arrays used for caching __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -62,21 +62,21 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current thread paying attention to coalesced memory accesses - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to 
coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data @@ -94,29 +94,29 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type } // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } // update the upper triangular kernel matrix - kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 62f24d6bf..8e8dd03c2 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -15,7 +15,7 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type namespace plssvm::cuda::detail { @@ -26,10 +26,10 @@ namespace plssvm::cuda::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -41,56 +41,64 @@ namespace plssvm::cuda::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... 
kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * 
THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a warp must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j 
= 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -101,16 +109,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -120,42 +130,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * 
B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -164,51 +176,54 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the 
shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 8003a51a3..72a4499ae 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ -57,36 +57,12 @@ template <> * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - 
const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 204d6bd97..5469b01d9 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -15,166 +15,178 @@ #include "plssvm/backends/CUDA/kernel/detail/atomics.cuh" // atomicAdd for double precision floating point numbers on older CUDA hardware #include "plssvm/backends/CUDA/kernel/kernel_functions.cuh" // plssvm::cuda::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. - * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] sv the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in 
grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * 
(device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv + sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - 
w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type 
*predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the 
global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
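In scalar form, the kernel documented here computes, for every predict point p and class c, prediction(p, c) = sum_s alpha(c, s) * k(sv_s, point_p) - rho[c], where k is the chosen kernel function and the bias rho is applied exactly once per (point, class) pair. A minimal host-side reference sketch of that computation (illustrative names only, not part of this patch; the blocking, padding, SoA layouts, and atomic updates of the device kernel are omitted):

    #include <cstddef>
    #include <functional>
    #include <vector>

    void predict_reference(std::vector<double> &prediction,                                // num_predict_points x num_classes
                           const std::vector<double> &alpha,                               // num_classes x num_sv
                           const std::vector<double> &rho,                                 // num_classes
                           const std::function<double(std::size_t, std::size_t)> &kernel,  // k(sv index, predict point index)
                           const std::size_t num_predict_points, const std::size_t num_classes, const std::size_t num_sv) {
        for (std::size_t p = 0; p < num_predict_points; ++p) {
            for (std::size_t c = 0; c < num_classes; ++c) {
                double temp = -rho[c];  // apply the bias exactly once per (point, class) pair
                for (std::size_t s = 0; s < num_sv; ++s) {
                    temp += alpha[c * num_sv + s] * kernel(s, p);
                }
                prediction[p * num_classes + c] = temp;
            }
        }
    }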
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] sv the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -184,53 +196,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to 
access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -239,57 +253,61 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - - // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - 
out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - } else { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); + + { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[dim + threadIdx_y]; + } else { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; + } } - } - __syncthreads(); // wait until all threads loaded their part of the data - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + __syncthreads(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations - } - // add intermediate cached results to prediction_d - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - 
const auto global_pp_idx = pp_idx + static_cast(internal); + // atomically add the intermediate cached results to the prediction + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + } + __syncthreads(); // wait until all threads updated their part of the prediction } - __syncthreads(); // wait until all threads updated their part of the prediction } } } From dbc00aed81991c4ff140be152209954e458d4994 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Wed, 28 May 2025 22:23:13 +0200 Subject: [PATCH 28/93] Do not use std::vector directly for the kernel matrix since it sequentially initializes all values to zero. Instead, use a std::unique_ptr together with a C++17 conformant make_unique_for_overwrite implementation followed by an OpenMP parallel zero initialization of all values drastically reducing the overhead. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 6 +- .../cg_explicit/kernel_matrix_assembly.hpp | 3 +- .../detail/make_unique_for_overwrite.hpp | 101 ++++++++++++++++++ src/plssvm/backends/OpenMP/csvm.cpp | 35 ++++-- 4 files changed, 129 insertions(+), 16 deletions(-) create mode 100644 include/plssvm/detail/make_unique_for_overwrite.hpp diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index e1041024a..ff7fc6f36 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -37,9 +37,8 @@ namespace plssvm::openmp::detail { * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); @@ -119,9 +118,8 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num 
* @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9403b12a1..9571513b9 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -40,9 +40,8 @@ namespace plssvm::openmp::detail { * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp new file mode 100644 index 000000000..51b56e126 --- /dev/null +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -0,0 +1,101 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. 
+ * See the LICENSE.md file in the project root for full license information. + * + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + */ + +#ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ + +#include // std::size_t +#include // std::unique_ptr +#include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v + +namespace plssvm::detail { + +/** + * @brief Helper struct to check whether @p T is an unbounded array. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref is_unbounded_array for unbounded arrays. + * @tparam T the array type + */ +template +struct is_unbounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref is_unbounded_array::value. + * @tparam T the array type + */ +template +constexpr bool is_unbounded_array_v = is_unbounded_array::value; + +/** + * @brief Helper struct to check whether @p T is a bounded array. + * @tparam T the array type + */ +template +struct is_bounded_array : std::false_type { }; + +/** + * @brief Specialization of @ref is_bounded_array for unbounded arrays. + * @tparam T the array type + * @tparam N the size of the array + */ +template +struct is_bounded_array : std::true_type { }; + +/** + * @brief Shortcut for @ref is_unbounded_array::value. + * @tparam T the array type + */ +template +constexpr bool is_bounded_array_v = is_bounded_array::value; + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +[[nodiscard]] std::unique_ptr make_unique_for_overwrite() { + return std::unique_ptr(new T); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. + * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the objects to create + * @param[in] n the size of the array to create + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +std::unique_ptr make_unique_for_overwrite(const std::size_t n) { + return std::unique_ptr(new std::remove_extent_t[n]); +} + +/** + * @brief A C++17 conform implementation of C++20's std::make_unique_for_overwrite. 
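The point of the _for_overwrite variant in this patch: unlike std::make_unique, it performs default-initialization (new T[n] instead of new T[n]()), so allocating the large kernel matrix buffer does not sequentially zero every element; the caller initializes the memory afterwards, e.g. in parallel with OpenMP as done in the OpenMP backend below. A minimal usage sketch (illustrative only, assuming a buffer of n doubles):

    #include "plssvm/detail/make_unique_for_overwrite.hpp"

    #include <cstddef>
    #include <memory>

    void zeroed_buffer_example(const std::size_t n) {
        // allocate without value-initialization
        std::unique_ptr<double[]> buffer = plssvm::detail::make_unique_for_overwrite<double[]>(n);

        // explicitly initialize the values afterwards, here in parallel using OpenMP
        #pragma omp parallel for
        for (std::size_t i = 0; i < n; ++i) {
            buffer[i] = 0.0;
        }
    }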
+ * @details For implementation details see: https://en.cppreference.com/w/cpp/memory/unique_ptr/make_unique + * @tparam T the type of the object to create + * @tparam Args the types of the constructor arguments + * @param[in] args the arguments to pass to the constructor + * @return a unique pointer to the newly created object (`[[nodiscard]]`) + */ +template , bool> = true> +auto make_unique_for_overwrite(Args &&...args) = delete; + +} // namespace plssvm::detail + +#endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 7a7c17ef2..656d966f3 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -19,6 +19,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::make_unique_for_overwrite #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -125,26 +126,40 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < num_entries; ++i) { + kernel_matrix[i] = real_type{ 0.0 }; + } +#else + std::memset(kernel_matrix.get(), 0, num_entries * sizeof(real_type)); +#endif + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - 
detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -202,16 +217,16 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; case solver_type::cg_explicit: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); From 10d303e3b4fe9835aab75ba3870b5bfe7c276678 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Thu, 29 May 2025 17:20:30 +0200 Subject: [PATCH 29/93] Improve the performance of the OpenMP cg_explicit kernel matrix assembly and BLAS implementation. Align names more to the ones used in the other backends. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 111 ++++++++++-------- .../cg_explicit/kernel_matrix_assembly.hpp | 70 ++++++----- .../OpenMP/kernel/kernel_functions.hpp | 35 +----- 3 files changed, 104 insertions(+), 112 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index ff7fc6f36..ecd80ab1a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -21,7 +21,6 @@ #include // std::array #include // std::ceil #include // std::size_t -#include // std::vector namespace plssvm::openmp::detail { @@ -29,24 +28,24 @@ namespace plssvm::openmp::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. 
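Throughout this hunk, A is the explicitly assembled kernel matrix stored as a padded, row-major upper triangle (only entries with row <= col are kept), which is why the index calculations below carry the row * (row + 1) / 2 correction term. A small helper showing the same mapping in isolation (a sketch with illustrative names; ld stands for the padded leading dimension num_rows - device_row_offset + PADDING_SIZE):

    #include <cstddef>
    #include <utility>

    // index of entry (row, col) in the packed upper-triangular storage of a symmetric matrix;
    // accesses below the diagonal are redirected to the mirrored entry above it
    std::size_t packed_upper_index(std::size_t row, std::size_t col, const std::size_t ld) {
        if (row > col) {
            std::swap(row, col);  // the matrix is symmetric
        }
        return row * ld + col - row * (row + 1) / 2;
    }

With this layout, row r stores only its ld - r entries from the diagonal onward (padding included), which is what the offsets in device_kernel_symm and device_kernel_symm_mirror rely on.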
* @param[in] num_rows the number of rows and columns in @p A * @param[in] num_rhs the number of rows in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -60,28 +59,33 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 
0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values + for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim + block_dim < global_j_idx) { + A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim + block_dim) * (dim + block_dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + block_dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * B(global_i_idx, dim + block_dim + device_row_offset); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_i][internal_j] += sum; } } } @@ -89,13 +93,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); } } } @@ -110,21 +115,21 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr 
rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); @@ -142,36 +147,42 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { // calculate the indices used in the current thread - const std::size_t rhs_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values + for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for 
(unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + const real_type A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + block_dim - std::size_t{ 1 }) * (dim + block_dim) / std::size_t{ 2 } + device_num_rows - dim + block_dim + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim + block_dim); + } + temp[internal_i][internal_j] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 9571513b9..b734a7c1a 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -28,29 +28,29 @@ namespace plssvm::openmp::detail { /** * @brief Assemble the kernel matrix using the @p kernel function. 
- * @tparam kernel the compile-time kernel function to use + * @tparam kernel_function the compile-time kernel function to use * @tparam Args the types of the potential additional arguments for the @p kernel function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function */ -template -void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -64,46 +64,52 @@ void device_kernel_assembly(real_type*kernel_matrix, const soa_matrix for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const 
std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_row] - q[global_col]; + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - // update the kernel matrix - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp index 59fd0f43c..359e2f8ff 100644 --- a/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/OpenMP/kernel/kernel_functions.hpp @@ -27,42 +27,17 @@ namespace plssvm::openmp::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// From 2e64193492090b25529f4fe4e8c30755375f4461 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 11:40:23 +0200 Subject: [PATCH 30/93] Improve the performance of the OpenMP cg_implicit kernel matrix assembly + BLAS implementation. Align names more to the ones used in the other backends. 
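The change applies the same two-level blocking scheme to the implicit (cg_implicit) path that the explicit kernels already use: each OpenMP thread accumulates an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of results in a private temp array and consumes the reduction dimension in chunks of THREAD_BLOCK_SIZE, keeping each chunk's partial result in a register-resident sum before it touches temp. The following standalone sketch only illustrates that loop structure; it is not part of the patch and substitutes plain std::vector, double, and arbitrary block sizes for plssvm's soa_matrix/aos_matrix, real_type, and compile-time constants, and it omits the device offset, padding, triangular, and bounds handling of the real kernels.

#include <array>
#include <cstddef>
#include <vector>

constexpr unsigned INTERNAL = 2;  // results computed per thread (register blocking)
constexpr unsigned BLOCK = 4;     // reduction values consumed per inner pass (cache blocking)

// Pairwise dot products (linear-kernel Gram matrix) of the rows of a row-major
// data set; num_rows and num_features are assumed to be multiples of the block
// sizes to keep the sketch short. K must be pre-sized to num_rows * num_rows.
void blocked_gram(const std::vector<double> &data, const std::size_t num_rows,
                  const std::size_t num_features, std::vector<double> &K) {
#pragma omp parallel for collapse(2)
    for (std::size_t i_block = 0; i_block < num_rows; i_block += INTERNAL) {
        for (std::size_t j_block = 0; j_block < num_rows; j_block += INTERNAL) {
            // thread-private accumulator for an INTERNAL x INTERNAL tile
            std::array<std::array<double, INTERNAL>, INTERNAL> temp{};
            // consume the feature dimension in chunks of BLOCK
            for (std::size_t dim = 0; dim < num_features; dim += BLOCK) {
                for (unsigned ii = 0; ii < INTERNAL; ++ii) {
                    for (unsigned jj = 0; jj < INTERNAL; ++jj) {
                        double sum = 0.0;  // register-resident partial dot product
                        for (unsigned b = 0; b < BLOCK; ++b) {
                            sum += data[(i_block + ii) * num_features + dim + b]
                                 * data[(j_block + jj) * num_features + dim + b];
                        }
                        temp[ii][jj] += sum;
                    }
                }
            }
            // write the finished tile back
            for (unsigned ii = 0; ii < INTERNAL; ++ii) {
                for (unsigned jj = 0; jj < INTERNAL; ++jj) {
                    K[(i_block + ii) * num_rows + (j_block + jj)] = temp[ii][jj];
                }
            }
        }
    }
}

In the actual device_kernel_assembly_symm below, the finished tile is never stored: the kernel function, QA_cost, q, and cost terms are applied in place and the tile is immediately multiplied with B and accumulated into C (using omp atomic for the symmetric update), which is what keeps the memory footprint of cg_implicit small.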
--- .../kernel_matrix_assembly_blas.hpp | 104 +++++++++++------- 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 771689209..60c10de07 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -26,25 +26,25 @@ namespace plssvm::openmp::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -53,8 +53,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -67,54 +67,78 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { // calculate the indices used in the current thread - const std::size_t row_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto 
global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_row); + C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(class_idx, global_row) += alpha * temp_ij * B(class_idx, global_col); -// symmetry + C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_j_idx); + // symmetry #pragma omp atomic - C(class_idx, global_col) += alpha * temp_ij * B(class_idx, global_row); + C(dim + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); } } } From 8aa1c93bbaab8c98310cab60355b539c01f0da66 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 11:40:41 +0200 Subject: [PATCH 31/93] Improve the performance of the OpenMP predict implementation. Align names more to the ones used in the other backends. 
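The predict path now uses the same register/cache blocking: device_kernel_w_linear, device_kernel_predict_linear, and device_kernel_predict each compute an INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile per thread with a private temp array and reduce over the support vectors or features in chunks of THREAD_BLOCK_SIZE. A minimal standalone sketch of the blocked w accumulation for the linear kernel follows; it is illustrative only and uses plain row-major std::vector, double, and fixed block sizes instead of plssvm's aos_matrix/soa_matrix, real_type, and compile-time constants, and it drops the device-specific range (device_num_sv, sv_offset) and the bounds checks of the real kernel.

#include <array>
#include <cstddef>
#include <vector>

constexpr unsigned INTERNAL = 2;  // entries of w computed per thread
constexpr unsigned BLOCK = 4;     // support vectors consumed per inner pass

// w(class, feature) = sum over sv of alpha(class, sv) * support_vectors(sv, feature);
// num_classes, num_features, and num_sv are assumed to be multiples of the block sizes.
void blocked_w_linear(const std::vector<double> &alpha,            // num_classes x num_sv
                      const std::vector<double> &support_vectors,  // num_sv x num_features
                      std::vector<double> &w,                      // num_classes x num_features
                      const std::size_t num_classes, const std::size_t num_features,
                      const std::size_t num_sv) {
#pragma omp parallel for collapse(2)
    for (std::size_t feature_block = 0; feature_block < num_features; feature_block += INTERNAL) {
        for (std::size_t class_block = 0; class_block < num_classes; class_block += INTERNAL) {
            // thread-private accumulator for an INTERNAL x INTERNAL tile of w
            std::array<std::array<double, INTERNAL>, INTERNAL> temp{};
            for (std::size_t sv = 0; sv < num_sv; sv += BLOCK) {
                for (unsigned c = 0; c < INTERNAL; ++c) {
                    for (unsigned f = 0; f < INTERNAL; ++f) {
                        double sum = 0.0;  // register-resident partial dot product
                        for (unsigned b = 0; b < BLOCK; ++b) {
                            sum += alpha[(class_block + c) * num_sv + sv + b]
                                 * support_vectors[(sv + b) * num_features + feature_block + f];
                        }
                        temp[c][f] += sum;
                    }
                }
            }
            // write the finished tile back to w
            for (unsigned c = 0; c < INTERNAL; ++c) {
                for (unsigned f = 0; f < INTERNAL; ++f) {
                    w[(class_block + c) * num_features + feature_block + f] = temp[c][f];
                }
            }
        }
    }
}

The prediction kernels below follow the same pattern, with the reduction running over the features (linear case) or over the support vectors plus a final kernel-function application (general case).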
--- .../backends/OpenMP/kernel/predict_kernel.hpp | 199 +++++++++++++----- 1 file changed, 147 insertions(+), 52 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 407096055..1540397bc 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -31,28 +31,71 @@ namespace plssvm::openmp::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_features = support_vectors.num_cols(); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(num_classes, num_features, device_specific_num_sv, sv_offset) - for (std::size_t a = 0; a < num_classes; ++a) { - for (std::size_t dim = 0; dim < num_features; ++dim) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t idx = 0; idx < device_specific_num_sv; ++idx) { - temp = std::fma(alpha(a, sv_offset + idx), support_vectors(sv_offset + idx, dim), temp); + // calculate constants + const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) + for (std::size_t dim = 0; dim < blocked_num_features; dim 
+= THREAD_BLOCK_SIZE_uz) { + for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t dim_block = 0; dim_block < THREAD_BLOCK_SIZE_uz; ++dim_block) { + for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + // calculate the indices used in the current thread + const std::size_t feature_idx = (dim + dim_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (unsigned block_sv = 0; block_sv < THREAD_BLOCK_SIZE; ++block_sv) { + sum += alpha(global_class_idx, sv_offset + sv + block_sv) * support_vectors(sv_offset + sv + block_sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); + + if (global_class_idx < num_classes && global_feature_idx < num_features) { + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; + } + } + } + } } - w(a, dim) = temp; } } } @@ -63,29 +106,73 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific 
predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); -#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(num_classes, num_features, device_specific_num_predict_points, row_offset) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - real_type temp{ 0.0 }; -#pragma omp simd reduction(+ : temp) - for (std::size_t dim = 0; dim < num_features; ++dim) { - temp = std::fma(w(a, dim), predict_points(row_offset + point_index, dim), temp); + // calculate constants + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); + + // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + +#pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) + for (std::size_t point = 0; point < blocked_device_num_predict_points; point += THREAD_BLOCK_SIZE_uz) { + for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + // perform operations on the current block + for (std::size_t point_block = 0; point_block < THREAD_BLOCK_SIZE_uz; ++point_block) { + for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + // calculate the indices used in the current thread + const std::size_t pp_idx = (point + point_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + + // create a thread private array used for internal caching + std::array, INTERNAL_BLOCK_SIZE> temp{}; + + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += w(global_class_idx, dim + block_dim) * predict_points(global_pp_idx, dim + block_dim); + } + temp[internal_class][internal_pp] += sum; + } + } + } + + // store the result back to the w vector + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto device_global_pp_idx = pp_idx + 
static_cast(internal_pp); + const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_class_idx = class_idx + static_cast(internal_class); + + if (global_class_idx < num_classes && global_pp_idx < device_num_predict_points) { + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; + } + } + } + } } - prediction(row_offset + point_index, a) = temp - rho[a]; } } } @@ -99,24 +186,24 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows @@ -124,34 +211,39 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t point_index = 0; point_index < device_specific_num_predict_points; ++point_index) { - for (std::size_t a = 0; a < num_classes; ++a) { - prediction(row_offset + point_index, a) -= rho[a]; + for (std::size_t pp_idx = 0; pp_idx < device_num_predict_points; ++pp_idx) { + for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { + prediction(device_row_offset + pp_idx, class_idx) -= rho[class_idx]; } } #pragma omp parallel for collapse(2) - for (std::size_t pp = 0; pp < blocked_device_specific_num_predict_points; pp += THREAD_BLOCK_SIZE_uz) { - for (std::size_t sv = 0; sv < blocked_num_support_vectors; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t x_block = 0; x_block < blocked_device_specific_num_predict_points; x_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t y_block = 0; y_block < blocked_num_support_vectors; y_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t pp_block = 0; pp_block < THREAD_BLOCK_SIZE_uz; ++pp_block) { - for (std::size_t sv_block = 0; sv_block < THREAD_BLOCK_SIZE_uz; ++sv_block) { + for (std::size_t x_thread = 0; 
x_thread < THREAD_BLOCK_SIZE_uz; ++x_thread) { + for (std::size_t y_thread = 0; y_thread < THREAD_BLOCK_SIZE_uz; ++y_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (pp + pp_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (sv + sv_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (x_block + x_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (y_block + y_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, dim + block_dim), predict_points(global_pp_idx, dim + block_dim)); + } + temp[internal_pp][internal_sv] += sum; } } } @@ -159,22 +251,25 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + // be sure to not perform out-of-bounds accesses + if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp 
atomic - prediction(global_pp_idx, a) += alpha(a, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, dim + class_idx) += alpha(dim + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + } } } } From 51b75b60eeb8431d71b0da2db0de7766ebb536c0 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 12:21:28 +0200 Subject: [PATCH 32/93] Improve variable names and remove some implicit conversions. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 44 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 22 +++---- .../kernel_matrix_assembly_blas.hpp | 26 ++++----- .../backends/OpenMP/kernel/predict_kernel.hpp | 58 +++++++++---------- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index ecd80ab1a..298962c19 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -53,20 +53,20 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_device_specific_num_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_specific_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all values - for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -75,15 +75,15 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num const auto global_j_idx = j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { real_type A_cache = 0.0; // determine on which side of the diagonal we are located - if (dim + block_dim < global_j_idx) { - A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim + block_dim) * (dim + block_dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + dim < 
global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; } else { - A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + block_dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; } - sum += A_cache * B(global_i_idx, dim + block_dim + device_row_offset); + sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); } temp[internal_i][internal_j] += sum; } @@ -141,20 +141,20 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); #pragma omp parallel for collapse(2) - for (std::size_t rhs = 0; rhs < blocked_num_rhs; rhs += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row = 0; row < blocked_num_mirror_rows; row += THREAD_BLOCK_SIZE_uz) { + for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_num_mirror_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t rhs_block = 0; rhs_block < THREAD_BLOCK_SIZE_uz; ++rhs_block) { - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { + for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (rhs + rhs_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (rhs_block + rhs_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over the remaining values - for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -163,9 +163,9 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const auto global_j_idx = j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - const real_type A_cache = A[(dim + block_dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + block_dim - std::size_t{ 1 }) * (dim + block_dim) / std::size_t{ 2 } + device_num_rows - dim + block_dim + global_j_idx]; - sum += A_cache * B(global_i_idx, device_row_offset + dim + block_dim); + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - dim_block + dim + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; } diff --git 
a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index b734a7c1a..f384645b1 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -58,14 +58,14 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(PADDING_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix if (i_idx >= j_idx) { @@ -73,7 +73,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -82,10 +82,10 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } - temp[internal_j][internal_i] += sum; + temp[internal_i][internal_j] += sum; } } } @@ -101,7 +101,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix= global_j_idx) { - real_type temp_ij = temp[internal_j][internal_i]; + real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 60c10de07..3ca4e4dc6 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -61,14 +61,14 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) schedule(dynamic) - for (std::size_t row = 0; row < blocked_row_range; row += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col = 0; col < blocked_device_specific_num_rows; col += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t row_block = 0; row_block < THREAD_BLOCK_SIZE_uz; ++row_block) { - for (std::size_t col_block = 0; col_block < THREAD_BLOCK_SIZE_uz; ++col_block) { + for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { + for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { // calculate the indices used in the current thread - const std::size_t i_idx = (row + row_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j_idx = (col + col_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (row_block + row_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (col_block + col_thread) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix if (i_idx >= j_idx) { @@ -76,7 +76,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { // calculate the indices to access the global data @@ -84,8 +84,8 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data(global_i_idx, dim + block_dim), data(global_j_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } temp[internal_j][internal_i] += sum; } @@ -119,7 +119,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector //*************************************************************************// // calculate C += alpha * temp * B // //*************************************************************************// - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += 
THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); @@ -129,16 +129,16 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // only apply once to the diagonal for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { // apply it for the upper and lower triangular matrix for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { #pragma omp atomic - C(dim + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_j_idx); + C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry #pragma omp atomic - C(dim + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(dim + class_idx, global_i_idx); + C(class_block + class_idx, global_j_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 1540397bc..49d98d4da 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -53,19 +53,19 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) - for (std::size_t dim = 0; dim < blocked_num_features; dim += THREAD_BLOCK_SIZE_uz) { - for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t dim_block = 0; dim_block < THREAD_BLOCK_SIZE_uz; ++dim_block) { - for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + for (std::size_t feature_thread = 0; feature_thread < THREAD_BLOCK_SIZE_uz; ++feature_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { // calculate the indices used in the current thread - const std::size_t feature_idx = (dim + dim_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (feature_block + feature_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < 
INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { @@ -74,8 +74,8 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(internal_class); real_type sum{ 0.0 }; - for (unsigned block_sv = 0; block_sv < THREAD_BLOCK_SIZE; ++block_sv) { - sum += alpha(global_class_idx, sv_offset + sv + block_sv) * support_vectors(sv_offset + sv + block_sv, global_feature_idx); + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, sv_offset + sv_block + sv) * support_vectors(sv_offset + sv_block + sv, global_feature_idx); } temp[internal_class][internal_feature] += sum; } @@ -129,19 +129,19 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); #pragma omp parallel for collapse(2) default(none) shared(prediction, w, rho, predict_points) firstprivate(blocked_device_num_predict_points, blocked_num_classes, device_num_predict_points, num_classes, num_features, device_row_offset) - for (std::size_t point = 0; point < blocked_device_num_predict_points; point += THREAD_BLOCK_SIZE_uz) { - for (std::size_t a = 0; a < blocked_num_classes; a += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t point_block = 0; point_block < THREAD_BLOCK_SIZE_uz; ++point_block) { - for (std::size_t a_block = 0; a_block < THREAD_BLOCK_SIZE_uz; ++a_block) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t class_thread = 0; class_thread < THREAD_BLOCK_SIZE_uz; ++class_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (point + point_block) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (a + a_block) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (class_block + class_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { @@ -150,8 +150,8 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons const auto global_class_idx = class_idx + static_cast(internal_class); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += w(global_class_idx, dim + block_dim) * predict_points(global_pp_idx, dim + block_dim); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); } temp[internal_class][internal_pp] += sum; } @@ -218,20 +218,20 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } #pragma omp parallel for collapse(2) - for (std::size_t x_block = 0; x_block < 
blocked_device_specific_num_predict_points; x_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t y_block = 0; y_block < blocked_num_support_vectors; y_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_specific_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block - for (std::size_t x_thread = 0; x_thread < THREAD_BLOCK_SIZE_uz; ++x_thread) { - for (std::size_t y_thread = 0; y_thread < THREAD_BLOCK_SIZE_uz; ++y_thread) { + for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { + for (std::size_t sv_thread = 0; sv_thread < THREAD_BLOCK_SIZE_uz; ++sv_thread) { // calculate the indices used in the current thread - const std::size_t pp_idx = (x_block + x_thread) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (y_block + y_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (pp_block + pp_thread) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (sv_block + sv_thread) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { @@ -240,8 +240,8 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto global_sv_idx = sv_idx + static_cast(internal_sv); real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(support_vectors(global_sv_idx, dim + block_dim), predict_points(global_pp_idx, dim + block_dim)); + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); } temp[internal_pp][internal_sv] += sum; } @@ -256,7 +256,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } // add results to prediction - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { // calculate the indices to access the global data and the data with respect to the current device @@ -268,7 +268,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, dim + class_idx) += alpha(dim + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; } } } From 8a570a8adc102e9d2267d5e328b6f9a9a16bb3a8 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 
14:39:58 +0200 Subject: [PATCH 33/93] Fix tests after slight API changes. --- tests/backends/generic_base_csvm_tests.hpp | 20 ++++++-- tests/backends/generic_csvm_tests.hpp | 55 ++++++++++------------ 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/tests/backends/generic_base_csvm_tests.hpp b/tests/backends/generic_base_csvm_tests.hpp index fd93963c1..f6c95038a 100644 --- a/tests/backends/generic_base_csvm_tests.hpp +++ b/tests/backends/generic_base_csvm_tests.hpp @@ -41,6 +41,7 @@ #include // std::sqrt, std::abs #include // std::size_t +#include // std::memcpy #include // std::numeric_limits::epsilon #include // std::unique_ptr, std::make_unique #include // std::ignore, std::tuple, std::make_tuple @@ -86,7 +87,10 @@ template == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { // only a single device for OpenMP, stdpar, and HPX on the CPU - result[0] = plssvm::detail::move_only_any{ calculate_partial_kernel_matrix(0, matr.num_rows()) }; + const std::vector partial_kernel_matrix = calculate_partial_kernel_matrix(0, matr.num_rows()); + auto ptr = std::make_unique(partial_kernel_matrix.size()); + std::memcpy(ptr.get(), partial_kernel_matrix.data(), partial_kernel_matrix.size() * sizeof(real_type)); + result[0] = plssvm::detail::move_only_any{ std::move(ptr) }; } else { for (std::size_t device_id = 0; device_id < csvm.num_available_devices(); ++device_id) { auto &device = csvm.devices_[device_id]; @@ -850,7 +854,8 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); const std::size_t num_devices = svm.num_available_devices(); // be sure to use the correct data distribution - svm.data_distribution_ = std::make_unique(plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices); + const plssvm::detail::triangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices }; + svm.data_distribution_ = std::make_unique(dist); // automatic solver type not permitted if constexpr (solver == plssvm::solver_type::automatic) { @@ -880,7 +885,9 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix_minimal) { // get result based on used backend std::vector kernel_matrix{}; if constexpr (plssvm::csvm_to_backend_type_v == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { - kernel_matrix = plssvm::detail::move_only_any_cast>(kernel_matrix_d[device_id]); // std::vector + const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast &>(kernel_matrix_d[device_id]); // std::unique_ptr + kernel_matrix.resize(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); + std::memcpy(kernel_matrix.data(), kernel_matrix_d_ptr.get(), kernel_matrix.size() * sizeof(plssvm::real_type)); } else { const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast(kernel_matrix_d[device_id]); // device_ptr -> convert it to a std::vector kernel_matrix.resize(kernel_matrix_d_ptr.size_padded()); @@ -960,7 +967,8 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { const mock_csvm_type svm = util::construct_from_tuple(params, csvm_test_type::additional_arguments); const std::size_t num_devices = svm.num_available_devices(); // be sure to use the correct data distribution - 
svm.data_distribution_ = std::make_unique(plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices); + const plssvm::detail::triangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_rows() - 1, num_devices }; + svm.data_distribution_ = std::make_unique(dist); // automatic solver type not permitted if constexpr (solver == plssvm::solver_type::automatic) { @@ -990,7 +998,9 @@ TYPED_TEST_P(GenericCSVMSolverKernelFunction, assemble_kernel_matrix) { // get result based on used backend std::vector kernel_matrix{}; if constexpr (plssvm::csvm_to_backend_type_v == plssvm::backend_type::openmp || plssvm::csvm_to_backend_type_v == plssvm::backend_type::stdpar || plssvm::csvm_to_backend_type_v == plssvm::backend_type::hpx) { - kernel_matrix = plssvm::detail::move_only_any_cast>(kernel_matrix_d[device_id]); // std::vector + const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast &>(kernel_matrix_d[device_id]); // std::unique_ptr + kernel_matrix.resize(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); + std::memcpy(kernel_matrix.data(), kernel_matrix_d_ptr.get(), kernel_matrix.size() * sizeof(plssvm::real_type)); } else { const auto &kernel_matrix_d_ptr = plssvm::detail::move_only_any_cast(kernel_matrix_d[device_id]); // device_ptr -> convert it to a std::vector kernel_matrix.resize(kernel_matrix_d_ptr.size_padded()); diff --git a/tests/backends/generic_csvm_tests.hpp b/tests/backends/generic_csvm_tests.hpp index 84b9b7ad9..549cd3a68 100644 --- a/tests/backends/generic_csvm_tests.hpp +++ b/tests/backends/generic_csvm_tests.hpp @@ -81,14 +81,15 @@ TYPED_TEST_P(GenericBackendCSVM, blas_level_3_kernel_explicit) { const std::size_t specific_num_rows = dist.place_specific_num_rows(device); const std::size_t row_offset = dist.place_row_offset(device); - device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_temp); + device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_temp); const std::size_t num_mirror_rows = num_rows - row_offset - specific_num_rows; if (num_mirror_rows > 0) { - device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_temp); + device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_temp); } C_res += C_temp; } + C_res.restore_padding(); // calculate correct results const plssvm::aos_matrix kernel_matrix_gemm_padded = ground_truth::assemble_full_kernel_matrix(params, data.data(), q_red, QA_cost); @@ -112,6 +113,7 @@ TYPED_TEST_P(GenericBackendCSVM, calculate_w) { const plssvm::detail::rectangular_data_distribution dist{ plssvm::mpi::communicator{}, data.num_data_points(), 1 }; device_kernel_w_linear(w, weights, data.data(), dist.place_specific_num_rows(0), dist.place_row_offset(0)); + w.restore_padding(); // calculate correct results const plssvm::soa_matrix correct_w = ground_truth::calculate_w(weights, data.data()); @@ -160,22 +162,22 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunction, assemble_kernel_matrix_explicit) switch (kernel) { case plssvm::kernel_function_type::linear: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case plssvm::kernel_function_type::polynomial: - device_kernel_assembly(kernel_matrix, 
data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case plssvm::kernel_function_type::rbf: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case plssvm::kernel_function_type::sigmoid: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case plssvm::kernel_function_type::laplacian: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case plssvm::kernel_function_type::chi_squared: - device_kernel_assembly(kernel_matrix, data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + device_kernel_assembly(kernel_matrix.data(), data_matr, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const std::vector correct_kernel_matrix = ground_truth::assemble_device_specific_kernel_matrix(params, data_matr, q_red, QA_cost, dist, 0); @@ -297,6 +299,7 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunction, predict_values) { device_kernel_predict(out, weights, rho, data_matr, predict_points, device_specific_num_predict_points, row_offset, std::get(params.gamma)); break; } + out.restore_padding(); // check out for correctness const plssvm::aos_matrix correct_out = ground_truth::predict_values(params, correct_w, weights, rho, data_matr, predict_points); @@ -337,45 +340,39 @@ TYPED_TEST_P(GenericBackendCSVMDeathTest, blas_level_3_kernel_explicit) { const std::size_t row_offset = dist.place_row_offset(0); { - // the A matrix must have the correct size - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, std::vector{}, B, beta, C), "A matrix may not be empty!"); - // the B matrix must have the correct shape const auto B_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the C matrix must have the correct shape auto C_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 
1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the place specific number of rows may not be too large - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, num_rows + 1, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, num_rows + 1, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the row offset may not be too large - EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, num_rows + 1, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm(num_rows, num_rhs, specific_num_rows, num_rows + 1, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); } { const std::size_t num_mirror_rows = num_rows - row_offset - specific_num_rows; - // the A matrix must have the correct size - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, std::vector{}, B, beta, C), "A matrix may not be empty!"); - // the B matrix must have the correct shape const auto B_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B_wrong, beta, C), ::testing::HasSubstr(fmt::format("B matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the C matrix must have the correct shape auto C_wrong = util::generate_random_matrix>(plssvm::shape{ std::min(0ULL, num_rows - 1), std::min(0ULL, num_rhs - 2) }, plssvm::shape{ plssvm::PADDING_SIZE, plssvm::PADDING_SIZE }); - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", 
std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C_wrong), ::testing::HasSubstr(fmt::format("C matrix sizes mismatch!: [{}, {}] != [{}, {}]", std::min(0, static_cast(num_rows) - 1), std::min(0, static_cast(num_rhs) - 2), num_rows, num_rhs))); // the place specific number of rows may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, num_rows + 1, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, num_rows + 1, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the mirror number of rows may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_rows + 1, specific_num_rows, row_offset, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_rows + 1, specific_num_rows, row_offset, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); // the row offset may not be too large - EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, num_rows + 1, alpha, kernel_matrix, B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); + EXPECT_DEATH(device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, specific_num_rows, num_rows + 1, alpha, kernel_matrix.data(), B, beta, C), ::testing::HasSubstr(fmt::format("The row offset ({}) cannot be greater the the total number of rows ({})!", num_rows + 1, num_rows))); } } @@ -445,22 +442,22 @@ TYPED_TEST_P(GenericBackendCSVMKernelFunctionDeathTest, assemble_kernel_matrix_e const auto run_assembly = [=](const plssvm::parameter ¶ms_p, std::vector &kernel_matrix_p, const plssvm::soa_matrix &data_p, const std::size_t device_specific_num_rows_p, const std::size_t row_offset_p, const std::vector &q_red_p, const plssvm::real_type QA_cost_p) { switch (kernel) { case plssvm::kernel_function_type::linear: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost); break; case plssvm::kernel_function_type::polynomial: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, params_p.degree, std::get(params_p.gamma), params_p.coef0); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, params_p.degree, std::get(params_p.gamma), params_p.coef0); break; case plssvm::kernel_function_type::rbf: - device_kernel_assembly(kernel_matrix_p, 
data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; case plssvm::kernel_function_type::sigmoid: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma), params_p.coef0); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma), params_p.coef0); break; case plssvm::kernel_function_type::laplacian: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; case plssvm::kernel_function_type::chi_squared: - device_kernel_assembly(kernel_matrix_p, data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); + device_kernel_assembly(kernel_matrix_p.data(), data_p, device_specific_num_rows_p, row_offset_p, q_red_p, QA_cost_p, params_p.cost, std::get(params_p.gamma)); break; } }; From 3025c7606fd3cb67c63f543a28077fe189eda991 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 15:02:20 +0200 Subject: [PATCH 34/93] Remove unnecessary conditions. Improve variable naming. --- .../OpenMP/kernel/cg_explicit/blas.hpp | 8 ++++---- .../backends/OpenMP/kernel/predict_kernel.hpp | 20 ++++++------------- src/plssvm/backends/OpenMP/csvm.cpp | 6 ++++++ 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 298962c19..81f560421 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -45,7 +45,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -54,7 +54,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num #pragma omp parallel for collapse(2) for (std::size_t rhs_block = 0; rhs_block < blocked_num_rhs; rhs_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t row_block = 0; row_block < blocked_device_specific_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t row_block = 0; row_block < blocked_device_num_rows; row_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t rhs_thread = 0; rhs_thread < THREAD_BLOCK_SIZE_uz; ++rhs_thread) { for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { @@ -83,7 +83,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num } else { A_cache = 
A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; } - sum += A_cache * B(global_i_idx, dim_block + dim + device_row_offset); + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; } @@ -164,7 +164,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz real_type sum{ 0.0 }; for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { - const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - dim_block + dim + global_j_idx]; + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } temp[internal_i][internal_j] += sum; diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 49d98d4da..89c0a380c 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -89,9 +89,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(internal_feature); const auto global_class_idx = class_idx + static_cast(internal_class); - if (global_class_idx < num_classes && global_feature_idx < num_features) { - w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; - } + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } } @@ -162,13 +160,10 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data - const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); - const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - if (global_class_idx < num_classes && global_pp_idx < device_num_predict_points) { - prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } } @@ -260,16 +255,13 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { // calculate the indices to access the global data and the data with respect to the current device - const auto device_global_pp_idx = pp_idx + static_cast(internal_pp); - const auto global_pp_idx = device_row_offset + device_global_pp_idx; + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); // be sure to not perform out-of-bounds accesses - if (device_global_pp_idx < device_num_predict_points && global_sv_idx < num_support_vectors) { - for 
(std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; - } + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; } } } diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index 656d966f3..d34b25066 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -275,6 +275,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; } } + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -330,6 +332,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -369,6 +373,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "predict_kernel", duration })); } + // restore padding entries by setting them to zero + out.restore_padding(); return out; } From 46a955806df29922477ac79cd5fea07cb0329f3d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 30 May 2025 15:19:14 +0200 Subject: [PATCH 35/93] Update variable names. 
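
Apply the loop-variable naming scheme already used in the OpenMP kernels (see
patches 32 and 34) to the CUDA kernels: the outer caching loops use a *_block
suffix (dim_block, feature_block, class_block, sv_block) for the offset that
advances in THREAD_BLOCK_SIZE steps, while the unsuffixed name (dim, feature,
sv) now denotes the position inside the currently cached block; the internal_*
names keep denoting the per-thread register blocking. This also retires the old
inner-loop name block_dim, which could be mistaken for CUDA's built-in blockDim.

The host-side snippet below is only an illustrative sketch of this naming
convention and is not part of the kernels; blocked_sum() and the constant value
chosen for THREAD_BLOCK_SIZE are made up for the example:

    #include <cstddef>
    #include <vector>

    // stand-in for the real compile-time constant
    constexpr std::size_t THREAD_BLOCK_SIZE = 8;

    // sum `values` in THREAD_BLOCK_SIZE-wide chunks:
    // `feature_block` is the offset of the current chunk, `feature` the position inside it
    double blocked_sum(const std::vector<double> &values) {
        double sum = 0.0;
        for (std::size_t feature_block = 0; feature_block < values.size(); feature_block += THREAD_BLOCK_SIZE) {
            for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE && feature_block + feature < values.size(); ++feature) {
                sum += values[feature_block + feature];  // global index = block offset + in-block position
            }
        }
        return sum;
    }
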
--- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 24 +++++------ .../cg_explicit/kernel_matrix_assembly.cuh | 12 +++--- .../kernel_matrix_assembly_blas.cuh | 26 ++++++------ .../backends/CUDA/kernel/predict_kernel.cuh | 40 +++++++++---------- 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index 1a6be4ae8..d2adc5618 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -58,7 +58,7 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all values using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows - device_row_offset); dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -67,20 +67,20 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t // store the values in the shared memory // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j_idx_linear) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim + threadIdx_y) * (dim + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - 
temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; } } } @@ -150,7 +150,7 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows // iterate over the remaining values using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_num_rows; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -158,16 +158,16 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim + threadIdx_y - std::size_t{ 1 }) * (dim + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh index e4a3fa22d..70c9b4101 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/kernel_matrix_assembly.cuh @@ -67,7 +67,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto j_idx_linear = blockIdx_y * blockDim_y * 
INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -75,17 +75,17 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 8e8dd03c2..960f61b9f 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -80,7 +80,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -88,17 +88,17 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx_linear = device_row_offset + 
j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -139,15 +139,15 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until all threads loaded their part of the data @@ -167,7 +167,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices to access the global data const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); - atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], 
C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -195,14 +195,14 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y]; // SoA + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data @@ -223,7 +223,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices to access the global data const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); - atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 5469b01d9..9c462127e 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -58,7 +58,7 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_num_sv; sv += THREAD_BLOCK_SIZE_uz) { + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -66,16 +66,16 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv + threadIdx_y]; // SoA - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = 
alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv + sv_offset + threadIdx_y]; // AoS + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + sv_offset + threadIdx_y]; // AoS } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += alpha_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; } } } @@ -137,7 +137,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -145,16 +145,16 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pp][internal_class] += w_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + 
temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; } } } @@ -226,7 +226,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses @@ -234,17 +234,17 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; // store the values in the shared memory - pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - pp_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -271,17 +271,17 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; // store the values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(dim + 
threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_y == std::size_t{ 0 }) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[dim + threadIdx_y]; + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } @@ -304,7 +304,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } From 0c682067bd8a86734bc94c7c5f2c9567a7133aef Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 14:44:36 +0200 Subject: [PATCH 36/93] Update documentation and add missing headers. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 2 + .../kernel_matrix_assembly_blas.cuh | 3 + .../backends/CUDA/kernel/kernel_functions.cuh | 1 - .../backends/CUDA/kernel/predict_kernel.cuh | 84 +++++++++---------- .../backends/OpenMP/kernel/predict_kernel.hpp | 10 +-- 5 files changed, 52 insertions(+), 48 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index d2adc5618..bacc84852 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -15,6 +15,8 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include // std::size_t + namespace plssvm::cuda::detail { /** diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 960f61b9f..bf1ee66e5 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -18,6 +18,8 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** @@ -186,6 +188,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } } + //*************************************************************************// // calculate C += alpha * temp * B for the LOWER triangular matrix // //*************************************************************************// diff --git a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh index 72a4499ae..7748c45c8 100644 --- a/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh +++ b/include/plssvm/backends/CUDA/kernel/kernel_functions.cuh @@ 
-51,7 +51,6 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 9c462127e..285cdc3a6 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -18,21 +18,23 @@ #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include // std::size_t + namespace plssvm::cuda::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] device_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha) the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -67,7 +69,7 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con // store the values in the shared memory feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + sv_offset + threadIdx_y]; // AoS + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS } __syncthreads(); // wait until all threads loaded their part of the data @@ -264,50 +266,48 @@ __global__ void device_kernel_predict(real_type *prediction, const 
real_type *al auto alpha_cache = reinterpret_cast(cache_one); auto out_cache = reinterpret_cast(cache_two); - { - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points - // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors - - // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data, pays attention to coalesced memory accesses - const std::size_t global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // store the values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS - // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == std::size_t{ 0 }) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; - } else { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - } + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS + // the bias (rho) must only be applied once for all support vectors + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; + } else { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } - __syncthreads(); // wait until all threads loaded their part of the data - - // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + 
internal_sv]; - } + } + __syncthreads(); // wait until all threads loaded their part of the data + + // calculate intermediate results and store them in shared memory + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } - __syncthreads(); // wait until all threads performed their part of the calculations } + __syncthreads(); // wait until all threads performed their part of the calculations + } - // atomically add the intermediate cached results to the prediction - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data - const auto global_pp_idx = pp_idx + static_cast(internal); + // atomically add the intermediate cached results to the prediction + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - } - __syncthreads(); // wait until all threads updated their part of the prediction + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } + __syncthreads(); // wait until all threads updated their part of the prediction } } } diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 89c0a380c..17696bd90 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -32,13 +32,13 @@ namespace plssvm::openmp::detail { * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors * @param[in] device_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); - 
PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); @@ -52,7 +52,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(INTERNAL_BLOCK_SIZE); const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); -#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, sv_offset) +#pragma omp parallel for collapse(2) default(none) shared(w, support_vectors, alpha) firstprivate(blocked_num_classes, blocked_num_features, num_classes, num_features, device_num_sv, device_sv_offset) for (std::size_t feature_block = 0; feature_block < blocked_num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (std::size_t class_block = 0; class_block < blocked_num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block @@ -75,7 +75,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix Date: Sat, 31 May 2025 14:45:38 +0200 Subject: [PATCH 37/93] Update the HIP backend kernels. --- .../HIP/kernel/cg_explicit/blas.hip.hpp | 297 +++++++-------- .../kernel_matrix_assembly.hip.hpp | 52 +-- .../kernel_matrix_assembly_blas.hip.hpp | 180 ++++----- .../HIP/kernel/kernel_functions.hip.hpp | 35 +- .../HIP/kernel/predict_kernel.hip.hpp | 342 +++++++++--------- 5 files changed, 463 insertions(+), 443 deletions(-) diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp index 124688d3a..b2e9c8ce3 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp @@ -13,11 +13,13 @@ #define PLSSVM_BACKENDS_HIP_CG_EXPLICIT_BLAS_HIP_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -25,8 +27,8 @@ namespace plssvm::hip::detail { * @details In a multi-GPU setting, this function is only responsible for the rows this device is responsible for! 
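 * @note Editorial sketch, not part of the original patch: the kernel body below reads the symmetric matrix @p A from a padded, packed upper-triangular layout and mirrors accesses that fall below the diagonal onto the upper triangle. The helper below only illustrates that index calculation; the names `packed_upper_index`, `n` (shorthand for `num_rows - device_row_offset`), and `padding` (the number of padding entries per row) are assumptions of this sketch and do not exist in PLSSVM.
 * @code
 * #include <cstddef>  // std::size_t
 * #include <utility>  // std::swap
 *
 * // index of the element (row, col) in a padded, packed upper-triangular matrix stored row by row;
 * // for row > col the symmetric element (col, row) is accessed instead
 * std::size_t packed_upper_index(std::size_t row, std::size_t col, const std::size_t n, const std::size_t padding) {
 *     if (row > col) {
 *         std::swap(row, col);  // mirror accesses below the diagonal onto the upper triangle
 *     }
 *     return row * (n + padding) + col - row * (row + std::size_t{ 1 }) / std::size_t{ 2 };
 * }
 * @endcode
 *       Storing only the upper triangle roughly halves the memory needed for @p A, which is why the branch on which side of the diagonal an access falls appears in the loading loop below.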
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -35,78 +37,77 @@ namespace plssvm::hip::detail { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> device_specific_num_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows - row_offset); dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // determine on which side of the diagonal we are located - if (dim + threadIdx_y < global_j) { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y) * (dim + threadIdx_y + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y - global_j * (global_j + 1ull) / 2ull]; - } - // determine on which side of the diagonal we are located - if (dim + threadIdx.y + THREAD_BLOCK_SIZE < global_j) { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) + global_j - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull + 1ull) / 2ull]; - } else { - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j * (num_rows - row_offset + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - global_j * (global_j + 1ull) / 2ull]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to 
coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_y < global_j_idx_linear) { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_y) * (dim_block + threadIdx_y + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[global_j_idx_linear * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + threadIdx_y - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim_block + device_row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y) * (num_rhs + PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(dim + row_offset + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && device_global_j < device_specific_num_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + 
global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -118,8 +119,8 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -128,68 +129,72 @@ __global__ void device_kernel_symm(const unsigned long long num_rows, const unsi * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_symm_mirror(const unsigned long long num_rows, const unsigned long long num_rhs, const unsigned long long num_mirror_rows, const unsigned long long device_specific_num_rows, const unsigned long long row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # rhs -> num_rhs - const auto 
i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # rows -> num_mirror_rows - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type A_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < device_specific_num_rows; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y - 1ull) * (dim + threadIdx_y) / 2ull + device_specific_num_rows - (dim + threadIdx_y) + global_j]; - A_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows - row_offset + PADDING_SIZE_ull) - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull - 1ull) * (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) / 2ull + device_specific_num_rows - (dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) + global_j]; - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y) * (num_rhs + 
PADDING_SIZE_ull) + global_i]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(row_offset + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rhs + PADDING_SIZE_ull) + global_i]; - } - __syncthreads(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rhs + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + A_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = A[(dim_block + threadIdx_y) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + threadIdx_y - std::size_t{ 1 }) * (dim_block + threadIdx_y) / std::size_t{ 2 } + device_num_rows - (dim_block + threadIdx_y) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = B[(device_row_offset + dim_block + threadIdx_y) * (num_rhs + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache[dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto 
global_j = row_offset + device_specific_num_rows + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs && partial_global_j < num_mirror_rows) { - C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i] = alpha * temp[internal_i][internal_j] + beta * C[global_j * (num_rhs + PADDING_SIZE_ull) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx] = alpha * temp[internal_i][internal_j] + beta * C[global_j_idx * (num_rhs + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -203,27 +208,29 @@ __global__ void device_kernel_symm_mirror(const unsigned long long num_rows, con * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_add(const unsigned long long num_cols, real_type *lhs, const real_type *rhs, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i 
= (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] += rhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j]; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] += rhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -236,27 +243,29 @@ __global__ void device_kernel_inplace_matrix_add(const unsigned long long num_co * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_inplace_matrix_scale(const unsigned long long num_cols, real_type *lhs, const real_type scale, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + 
offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; // # num_rows - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; // # num_rhs + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs[global_i * (num_cols + PADDING_SIZE_ull) + global_j] *= scale; + lhs[global_i_idx * (num_cols + PADDING_SIZE_uz) + global_j_idx] *= scale; // SoA } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp index f0e01f813..308867d76 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/kernel_matrix_assembly.hip.hpp @@ -55,7 +55,7 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large - // create two shared memory arrays used for caching data point features + // create two shared memory arrays used for caching __shared__ real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; __shared__ real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -65,30 +65,30 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current thread paying attention to coalesced memory accesses - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto 
global_j_linear = device_row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory - data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(dim + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -97,29 +97,29 @@ __global__ void device_kernel_assembly(real_type *kernel_matrix, const real_type } // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows - device_row_offset) && device_global_j < device_num_rows && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + 
static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i] - q[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } // update the upper triangular kernel matrix - kernel_matrix[device_global_j * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 77820e35a..97ef0798b 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -14,12 +14,14 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** @@ -28,10 +30,10 @@ namespace plssvm::hip::detail { * @tparam Args the types of the parameters necessary for the specific kernel function * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -43,56 +45,64 @@ namespace plssvm::hip::detail { * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const 
real_type *data_d, const unsigned long long num_rows, const unsigned long long device_num_rows, const unsigned long long row_offset, const unsigned long long num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const unsigned long long num_classes, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); +__global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large // calculate the indices used in the current thread - const auto i = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto i_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto j = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto j_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; + const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // only calculate the upper triangular matrix -> can't use threadIdx since all threads in a wavefront must progress further if (blockIdx_x >= blockIdx_y) { // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_i[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_j[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto 
global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_i[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_i[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_i]; - data_cache_j[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; - data_cache_j[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_rows + 1ull + PADDING_SIZE_ull) + global_j]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the shared memory + data_i_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = data[(feature_block + threadIdx_y) * (num_rows + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]); } } } @@ -103,16 +113,18 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if ((device_global_i < (num_rows - row_offset) && device_global_j < device_num_rows && global_i >= global_j)) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i] - q[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost; } } else { @@ -122,42 +134,44 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; - __shared__ real_type C_out_cache[INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][FEATURE_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = alpha * B[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; - C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y + THREAD_BLOCK_SIZE] = real_type{ 0.0 }; + // store the values in the shared memory + B_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = alpha * B[global_i_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + threadIdx.x][threadIdx.y] = real_type{ 0.0 }; // SoA } __syncthreads(); // wait until 
all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE] += - temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % FEATURE_BLOCK_SIZE]; + C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE] += + temp[internal_i][internal_j] * B_cache[threadIdx.x * INTERNAL_BLOCK_SIZE + internal_i][(class_idx + threadIdx.x) % THREAD_BLOCK_SIZE]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j + static_cast(internal); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); - atomicAdd(&C[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_x + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x + THREAD_BLOCK_SIZE]); + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal); + + atomicAdd(&C[global_j_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_x], C_out_cache[threadIdx.y * INTERNAL_BLOCK_SIZE + internal][threadIdx.x]); // SoA } __syncthreads(); // wai until all threads updated C with their values } @@ -166,51 +180,55 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // set potential diagonal entries in temp to 0.0 such that we don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset + i + static_cast(internal_i); - const auto global_j = row_offset + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // same shared memory size but with different dimensions - __shared__ real_type B_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type C_out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = 
reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y]; - B_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + // store the values in the shared memory + B_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha * B[global_j_idx_linear * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y]; // SoA C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - C_out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; + C_out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_i * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_i][internal_j] * B_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_j]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset + i + static_cast(internal); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&C[global_i * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], C_out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x]); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal); + + atomicAdd(&C[global_i_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], C_out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + 
threadIdx.x]); // SoA } __syncthreads(); // wait until all threads updated C with their values } diff --git a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp index a98bb0715..1b2be0ae6 100644 --- a/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/kernel_functions.hip.hpp @@ -51,42 +51,17 @@ template <> /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. * @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] __device__ __forceinline__ real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6e349927e..9aaba6c5e 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -14,169 +14,183 @@ #pragma once #include "plssvm/backends/HIP/kernel/kernel_functions.hip.hpp" // plssvm::hip::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include // std::size_t + namespace plssvm::hip::detail { /** * @brief Calculate the `w` vector used to speedup the prediction using the linear kernel function. 
- * @param[out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long device_specific_num_sv, const unsigned long long sv_offset, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (unsigned long long sv = 0; sv < device_specific_num_sv; sv += THREAD_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_features + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes - data_cache_feature[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[global_feature_idx * (device_specific_num_sv + PADDING_SIZE_ull) + sv + threadIdx_y]; // SoA - data_cache_alpha[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[global_class_idx * (num_sv + PADDING_SIZE_ull) + sv + sv_offset + threadIdx_y]; // AoS - } - __syncthreads(); // wait until all threads loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_feature[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the shared memory + feature_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[global_feature_idx_linear * (device_num_sv + PADDING_SIZE_uz) + sv_block + threadIdx_y]; // SoA + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[global_class_idx_linear * (num_sv + PADDING_SIZE_uz) + sv_block + device_sv_offset + threadIdx_y]; // AoS + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache[sv][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d[global_feature_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_feature][internal_class]; + w[global_feature_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. 
+ * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ -__global__ void device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_ull; - const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_w[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; +__global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // 
number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_ull; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_w[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - data_cache_w[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_classes + PADDING_SIZE_ull) + global_class_idx]; - } - __syncthreads(); // wait until all threads loaded their part of the data + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto class_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_classes + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into shared memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * 
data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]; + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = w[(feature_block + threadIdx_y) * (num_classes + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + __syncthreads(); // wait until all threads loaded their part of the data + + // perform the dot product calculation + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } + __syncthreads(); // wait until all threads performed their part of the calculations } - __syncthreads(); // wait until all threads performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + global_class_idx] = temp[internal_pd][internal_class] - rho_d[global_class_idx]; + prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho[global_class_idx]; // AoS } } } /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. 
* @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] sv the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -186,53 +200,55 @@ __global__ void device_kernel_predict_linear(real_type *prediction_d, const real * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const unsigned long long num_classes, const unsigned long long num_sv, const unsigned long long num_predict_points, const unsigned long long num_features, const unsigned long long grid_x_offset, const unsigned long long grid_y_offset, Args... kernel_function_parameter) { - // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows - const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension - const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension - const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension - const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension - const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size would be too large - const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_ull = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_ull = static_cast(THREAD_BLOCK_SIZE); - const auto FEATURE_BLOCK_SIZE_ull = static_cast(FEATURE_BLOCK_SIZE); - const auto PADDING_SIZE_ull = static_cast(PADDING_SIZE); - - // calculate the indices used in the current thread - const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_ull; - const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; - const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_ull + threadIdx_x; +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension + const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension + const auto blockDim_x = static_cast(blockDim.x); // number of threads in block x-dimension + const auto blockDim_y = static_cast(blockDim.y); // number of threads in block y-dimension + const auto blockIdx_x = static_cast(blockIdx.x) + grid_x_offset; // current block in grid x-dimension + offsets if the grid size is too large + const auto blockIdx_y = static_cast(blockIdx.y) + grid_y_offset; // current block in grid y-dimension + offsets if the grid size is too large + + // create two shared memory arrays used for caching + __shared__ real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + __shared__ real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; // create a thread private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // create the shared memory arrays used for caching data point features - __shared__ real_type data_cache_pp[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type data_cache_sv[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto pp_cache = reinterpret_cast(cache_one); + auto sv_cache = reinterpret_cast(cache_two); + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_predict_points + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_features; dim += FEATURE_BLOCK_SIZE_ull) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - data_cache_pp[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_pp[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_predict_points + PADDING_SIZE_ull) + global_pp_idx]; - data_cache_sv[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - data_cache_sv[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; 
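The change above (and the matching ones in the assembly kernels) replaces the two FEATURE_BLOCK_SIZE-sized caches with a single pair of flat THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE buffers that each phase views under its own 2D shape via reinterpret_cast. A minimal stand-alone sketch of that reuse pattern follows; the float element type, the constants, and the kernel name are illustrative placeholders, not plssvm's actual definitions.

__global__ void shared_memory_reuse_sketch() {
    constexpr unsigned TBS = 8;  // stand-in for THREAD_BLOCK_SIZE
    constexpr unsigned IBS = 4;  // stand-in for INTERNAL_BLOCK_SIZE

    // one flat allocation that both phases share
    __shared__ float cache_one[TBS * IBS * TBS];

    // phase 1: view the buffer as [TBS][IBS * TBS]
    auto phase_one_view = reinterpret_cast<float (*)[IBS * TBS]>(cache_one);
    phase_one_view[threadIdx.y][threadIdx.x] = 1.0f;
    __syncthreads();  // phase 1 must be finished before the same bytes are re-viewed

    // phase 2: view the same bytes as [IBS * TBS][TBS]
    auto phase_two_view = reinterpret_cast<float (*)[TBS]>(cache_one);
    phase_two_view[threadIdx.x][threadIdx.y] += 1.0f;
}

Compared to the removed FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE scheme, this halves the per-cache shared-memory footprint, while the blocked loops over features and classes now advance in THREAD_BLOCK_SIZE rather than FEATURE_BLOCK_SIZE steps.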
+ // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + + // store the values in the shared memory + pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < FEATURE_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pd]); + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][threadIdx.x * INTERNAL_BLOCK_SIZE + internal_pp]); } } } @@ -241,55 +257,57 @@ __global__ void device_kernel_predict(real_type *prediction_d, const real_type * } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter...); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); } } { - // same shared memory size but with different dimensions - __shared__ real_type alpha_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - __shared__ real_type out_cache[FEATURE_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // reinterpret the shared memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto alpha_cache = reinterpret_cast(cache_one); + auto out_cache = reinterpret_cast(cache_two); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < num_classes; dim += FEATURE_BLOCK_SIZE_ull) { + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { // load data into shared 
memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const unsigned long long global_sv_idx = sv_idx_linear + internal * THREAD_BLOCK_SIZE; - - // FEATURE_BLOCK_SIZE = 2 * THREAD_BLOCK_SIZE -> store twice as many values in the shared memory - alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; - alpha_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha_d[(dim + threadIdx_y + THREAD_BLOCK_SIZE_ull) * (num_sv + PADDING_SIZE_ull) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + // store the values in the shared memory + alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors - if (blockIdx_y == 0ull) { - out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y]; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho_d[dim + threadIdx_y + THREAD_BLOCK_SIZE_ull]; + if (blockIdx_y == std::size_t{ 0 }) { + out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = -rho[class_block + threadIdx_y]; } else { out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; - out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + threadIdx.x] = real_type{ 0.0 }; } } __syncthreads(); // wait until all threads loaded their part of the data // calculate intermediate results and store them in shared memory - for (unsigned class_idx = 0; class_idx < FEATURE_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + threadIdx.x] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % FEATURE_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + threadIdx.x] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + threadIdx.y) % THREAD_BLOCK_SIZE][threadIdx.y * INTERNAL_BLOCK_SIZE + internal_sv]; } } __syncthreads(); // wait until all threads performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); - atomicAdd(&prediction_d[global_pp_idx * (num_classes + PADDING_SIZE_ull) + dim + threadIdx_y + THREAD_BLOCK_SIZE_ull], out_cache[threadIdx.y + THREAD_BLOCK_SIZE][internal * 
THREAD_BLOCK_SIZE + threadIdx.x]); + atomicAdd(&prediction[global_pp_idx * (num_classes + PADDING_SIZE_uz) + class_block + threadIdx_y], out_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x]); } __syncthreads(); // wait until all threads updated their part of the prediction } From 45832e70abd46ed5b4042abebef83dcba8c8d32a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 14:48:38 +0200 Subject: [PATCH 38/93] Fix Doxygen documentation. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 51b56e126..06f4cbaa5 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -27,14 +27,14 @@ template struct is_unbounded_array : std::false_type { }; /** - * @brief Specialization of @ref is_unbounded_array for unbounded arrays. + * @brief Specialization of @ref plssvm::detail::is_unbounded_array for unbounded arrays. * @tparam T the array type */ template struct is_unbounded_array : std::true_type { }; /** - * @brief Shortcut for @ref is_unbounded_array::value. + * @brief Shortcut for @ref plssvm::detail::is_unbounded_array. * @tparam T the array type */ template @@ -48,7 +48,7 @@ template struct is_bounded_array : std::false_type { }; /** - * @brief Specialization of @ref is_bounded_array for unbounded arrays. + * @brief Specialization of @ref plssvm::detail::is_bounded_array for unbounded arrays. * @tparam T the array type * @tparam N the size of the array */ @@ -56,7 +56,7 @@ template struct is_bounded_array : std::true_type { }; /** - * @brief Shortcut for @ref is_unbounded_array::value. + * @brief Shortcut for @ref plssvm::detail::is_bounded_array. * @tparam T the array type */ template From 10ff3c26011db0042860d5ca8df6cd663ac988bd Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:33:47 +0200 Subject: [PATCH 39/93] Add additional assert. 
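Note on the new checks: these OpenMP kernels receive the explicit A matrix and the kernel matrix as raw pointers, so validating them up front turns a null-pointer dereference deep inside a parallel loop into a clear diagnostic whenever assertions are enabled. A minimal sketch of the precondition pattern, using PLSSVM_ASSERT in the same condition-plus-message form it is used with throughout the code base (the wrapper function and its parameters are purely illustrative, not part of the patch):

    #include "plssvm/detail/assert.hpp"  // PLSSVM_ASSERT

    #include <cstddef>  // std::size_t

    // illustrative entry point only; the real kernels live in include/plssvm/backends/OpenMP/kernel/
    template <typename real_type>
    void checked_kernel_entry(const real_type *A, real_type *kernel_matrix, const std::size_t num_entries) {
        // fail fast instead of dereferencing a null pointer inside the parallel region below
        PLSSVM_ASSERT(A != nullptr, "The A matrix pointer must be valid!");
        PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!");
        // ... the actual BLAS/assembly work on A and kernel_matrix would follow ...
        (void) num_entries;
    }
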
--- include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp | 2 ++ .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 1 + 2 files changed, 3 insertions(+) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 81f560421..3fbbaaa4b 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -38,6 +38,7 @@ namespace plssvm::openmp::detail { */ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); @@ -125,6 +126,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num */ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index f384645b1..381c8adf7 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -41,6 +41,7 @@ namespace plssvm::openmp::detail { */ template void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); From dad55f2688eb18ee01684cf9b605dd529273da54 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:34:57 +0200 Subject: [PATCH 40/93] Fix variable names. --- .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 4 ++-- .../OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 4 ++-- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index 381c8adf7..b442288df 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -51,7 +51,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -60,7 +60,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -62,7 +62,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector #pragma omp parallel for collapse(2) schedule(dynamic) for (std::size_t row_block = 0; row_block < blocked_row_range; row_block += THREAD_BLOCK_SIZE_uz) { - for (std::size_t col_block = 0; col_block < blocked_device_specific_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t col_block = 0; col_block < blocked_device_num_rows; col_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t row_thread = 0; row_thread < THREAD_BLOCK_SIZE_uz; ++row_thread) { for (std::size_t col_thread = 0; col_thread < THREAD_BLOCK_SIZE_uz; ++col_thread) { diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 17696bd90..a9fa64d07 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -198,7 +198,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const std::size_t 
num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows @@ -213,7 +213,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m } #pragma omp parallel for collapse(2) - for (std::size_t pp_block = 0; pp_block < blocked_device_specific_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { + for (std::size_t pp_block = 0; pp_block < blocked_device_num_predict_points; pp_block += THREAD_BLOCK_SIZE_uz) { for (std::size_t sv_block = 0; sv_block < blocked_num_support_vectors; sv_block += THREAD_BLOCK_SIZE_uz) { // perform operations on the current block for (std::size_t pp_thread = 0; pp_thread < THREAD_BLOCK_SIZE_uz; ++pp_thread) { From e6b76f2d90c21fb560f4af0b2187c7e4ae195594 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:35:18 +0200 Subject: [PATCH 41/93] Use typename instead of class. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 06f4cbaa5..8e7603cc1 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -68,7 +68,7 @@ constexpr bool is_bounded_array_v = is_bounded_array::value; * @tparam T the type of the object to create * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> [[nodiscard]] std::unique_ptr make_unique_for_overwrite() { return std::unique_ptr(new T); } @@ -80,7 +80,7 @@ template , bool> = true> * @param[in] n the size of the array to create * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> std::unique_ptr make_unique_for_overwrite(const std::size_t n) { return std::unique_ptr(new std::remove_extent_t[n]); } @@ -93,7 +93,7 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { * @param[in] args the arguments to pass to the constructor * @return a unique pointer to the newly created object (`[[nodiscard]]`) */ -template , bool> = true> +template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; } // namespace plssvm::detail From 5913b5028aebc2f9492f61ad830828eec17a3ee4 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 17:36:06 +0200 Subject: [PATCH 42/93] Move parallel zero memset to header function (used in multiple places). 
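For orientation, a minimal usage sketch of the helper this commit factors out; its definition follows in the diff below, and the allocation/zeroing pair mirrors the updated call site in src/plssvm/backends/OpenMP/csvm.cpp (real_type and the entry count are placeholders for the values computed there):

    #include "plssvm/detail/make_unique_for_overwrite.hpp"  // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset}

    #include <cstddef>  // std::size_t

    void example_zeroed_buffer() {
        using real_type = double;              // placeholder; the library defines its own real_type
        const std::size_t num_entries = 1024;  // placeholder size
        // allocate without value-initialization, then zero the buffer in parallel
        // (OpenMP if available, sequential std::memset otherwise)
        auto kernel_matrix = plssvm::detail::make_unique_for_overwrite<real_type[]>(num_entries);
        plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries);
    }

One common motivation for the parallel loop over a plain std::memset is NUMA first-touch placement of the zeroed pages; the sequential memset remains only as the fallback when OpenMP is unavailable.
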
--- .../detail/make_unique_for_overwrite.hpp | 18 ++++++++++++++++++ src/plssvm/backends/OpenMP/csvm.cpp | 13 +++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index 8e7603cc1..fcb205622 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -13,7 +13,10 @@ #ifndef PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ #define PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT + #include // std::size_t +#include // std::memset #include // std::unique_ptr #include // std::false_type, std::true_type, std::enable_if_t, std::is_array_v @@ -96,6 +99,21 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; +template +void parallel_zero_memset(T *dest, const std::size_t count) { + PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); + +// initialize the data pointed to by dest to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset +#if defined(_OPENMP) + #pragma omp parallel for + for (std::size_t i = 0; i < count; ++i) { + dest[i] = T{ 0 }; + } +#else + std::memset(dest, 0, count * sizeof(T)); +#endif +} + } // namespace plssvm::detail #endif // PLSSVM_DETAIL_MAKE_UNIQUE_FOR_OVERWRITE_HPP_ diff --git a/src/plssvm/backends/OpenMP/csvm.cpp b/src/plssvm/backends/OpenMP/csvm.cpp index d34b25066..868ab32e6 100644 --- a/src/plssvm/backends/OpenMP/csvm.cpp +++ b/src/plssvm/backends/OpenMP/csvm.cpp @@ -19,7 +19,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked -#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::make_unique_for_overwrite +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -131,15 +131,8 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // only explicitly store the upper triangular matrix auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); - // initialize kernel matrix to all zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset -#if defined(_OPENMP) - #pragma omp parallel for - for (std::size_t i = 0; i < num_entries; ++i) { - kernel_matrix[i] = real_type{ 0.0 }; - } -#else - std::memset(kernel_matrix.get(), 0, num_entries * sizeof(real_type)); -#endif + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { From a67751bd9461722cbed2acf7dbc421722d4a5652 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 18:22:26 +0200 Subject: [PATCH 43/93] Add documentation and rearrange constant declarations. 
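The constants touched below all follow the same two-level blocking scheme used throughout these kernels: a count is first rounded up to whole INTERNAL_BLOCK_SIZE-wide blocks, and the outer loops then step over those blocks in THREAD_BLOCK_SIZE-sized chunks. A compact sketch of that index math (the block-size values here are illustrative stand-ins; the real compile-time constants come from PLSSVM's headers):

    #include <cmath>    // std::ceil
    #include <cstddef>  // std::size_t

    // illustrative values only; PLSSVM defines the real INTERNAL_BLOCK_SIZE / THREAD_BLOCK_SIZE
    constexpr unsigned internal_block_size = 4;   // values computed per work item and dimension
    constexpr unsigned thread_block_size   = 16;  // work items per block and dimension

    // number of internal_block_size-wide blocks needed to cover `count` elements,
    // mirroring the blocked_* constants computed in the kernels below
    std::size_t blocked(const std::size_t count) {
        return static_cast<std::size_t>(std::ceil(static_cast<double>(count) / internal_block_size));
    }

    // the outer loops then advance over the blocked range in thread_block_size steps, e.g.:
    //   for (std::size_t block = 0; block < blocked(count); block += thread_block_size) { ... }
    // so one outer iteration covers thread_block_size * internal_block_size elements per dimension
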
--- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 1 + include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 067608773..391b9fd90 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -122,6 +122,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index a9fa64d07..1eed9735e 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -43,8 +43,6 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); @@ -65,6 +63,7 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix, INTERNAL_BLOCK_SIZE> temp{}; + // iterate over all support vectors for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { @@ -117,8 +116,6 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons // calculate constants const std::size_t num_classes = prediction.num_cols(); const std::size_t num_features = predict_points.num_cols(); - - // calculate constants const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); @@ -139,6 +136,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { @@ -197,9 +195,9 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); + const std::size_t num_features = predict_points.num_cols(); const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / 
INTERNAL_BLOCK_SIZE)); - const std::size_t num_features = predict_points.num_cols(); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); @@ -258,7 +256,6 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out-of-bounds accesses for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; From 54741fff26ea2fa0ebbd3e508895451682ad88c1 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:12:51 +0200 Subject: [PATCH 44/93] Inverse all temp indices for better consistency. --- .../plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp | 8 ++++---- .../OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp | 4 ++-- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp index 3fbbaaa4b..01db6a60e 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/blas.hpp @@ -86,7 +86,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num } sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -101,7 +101,7 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // be sure to not perform out-of-bounds accesses if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { - C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -169,7 +169,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -184,7 +184,7 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz // be sure to not perform out-of-bounds accesses if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { - C(global_i_idx, global_j_idx) = alpha * temp[internal_i][internal_j] + beta * C(global_i_idx, global_j_idx); + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp index b442288df..aa465dead 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -86,7 +86,7 @@ void 
device_kernel_assembly(real_type *kernel_matrix, const soa_matrix(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); } - temp[internal_i][internal_j] += sum; + temp[internal_j][internal_i] += sum; } } } @@ -102,7 +102,7 @@ void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix= global_j_idx) { - real_type temp_ij = temp[internal_i][internal_j]; + real_type temp_ij = temp[internal_j][internal_i]; // apply the final kernel function temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 1eed9735e..7bea4b3c4 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -236,7 +236,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); } - temp[internal_pp][internal_sv] += sum; + temp[internal_sv][internal_pp] += sum; } } } @@ -244,7 +244,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } @@ -258,7 +258,7 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic - prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_pp][internal_sv]; + prediction(global_pp_idx, class_block + class_idx) += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } From 46891d9b43158ce084aad132f9d90947a28ab7bb Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:31:54 +0200 Subject: [PATCH 45/93] Add missing doxygen documentation. --- include/plssvm/detail/make_unique_for_overwrite.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/plssvm/detail/make_unique_for_overwrite.hpp b/include/plssvm/detail/make_unique_for_overwrite.hpp index fcb205622..ca58eec3a 100644 --- a/include/plssvm/detail/make_unique_for_overwrite.hpp +++ b/include/plssvm/detail/make_unique_for_overwrite.hpp @@ -99,6 +99,12 @@ std::unique_ptr make_unique_for_overwrite(const std::size_t n) { template , bool> = true> auto make_unique_for_overwrite(Args &&...args) = delete; +/** + * @brief Fill the array @p dest with zeros in parallel using OpenMP if available, otherwise fall back to a sequential memset. 
+ * @tparam T the type of the values + * @param[in,out] dest the array to fill with zeros + * @param[in] count the number of values to fill + */ template void parallel_zero_memset(T *dest, const std::size_t count) { PLSSVM_ASSERT(dest != nullptr, "The destination pointer may not be a nullptr!"); From fa5cea380199ef9c8204c2ffd4890ec389493c87 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 31 May 2025 20:56:51 +0200 Subject: [PATCH 46/93] Update the HPX backend kernels. --- .../backends/HPX/kernel/cg_explicit/blas.hpp | 130 +++++++------- .../cg_explicit/kernel_matrix_assembly.hpp | 87 ++++----- .../kernel_matrix_assembly_blas.hpp | 111 +++++++----- .../backends/HPX/kernel/kernel_functions.hpp | 35 +--- .../backends/HPX/kernel/predict_kernel.hpp | 165 +++++++++--------- src/plssvm/backends/HPX/csvm.cpp | 35 ++-- 6 files changed, 299 insertions(+), 264 deletions(-) diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp index 20cbad247..99aeec376 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/blas.hpp @@ -34,60 +34,63 @@ namespace plssvm::hpx::detail { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a symmetric matrix (memory optimized), @p B and @p C are matrices, and @p alpha and @p beta are scalars. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); +inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); 
+ PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_device_num_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_device_specific_num_rows; - const std::size_t row = idx % blocked_device_specific_num_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < (num_rows - row_offset); ++dim) { + // iterate over all values + for (std::size_t dim_block = 0; dim_block < (num_rows - device_row_offset); dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_row) { - A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) + global_row - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A[global_row * (num_rows - row_offset + PADDING_SIZE_uz) + dim - global_row * (global_row + std::size_t{ 1 }) / std::size_t{ 2 }]; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_cache = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; + } else { + A_cache = A[global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; + } + sum += A_cache * 
B(global_i_idx, device_row_offset + dim_block + dim); } - temp[internal_i][internal_j] += A_val * B(global_rhs, dim + row_offset); + temp[internal_j][internal_i] += sum; } } } @@ -95,13 +98,14 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t device_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && device_global_row < device_specific_num_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && device_global_j_idx < device_num_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } @@ -113,22 +117,22 @@ inline void device_kernel_symm(const std::size_t num_rows, const std::size_t num * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B * @param[in] beta the scalar beta value * @param[in,out] C the matrix @p C, also used as result matrix */ -inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const std::vector &A, const soa_matrix &B, const real_type beta, soa_matrix &C) { +inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const soa_matrix &B, const real_type beta, soa_matrix &C) { // compute: C = alpha * A * B + beta * C with A in m x k, B in n x k, and C in n x m, alpha, beta as scalar - PLSSVM_ASSERT(!A.empty(), "A matrix may not be empty!"); + PLSSVM_ASSERT(A != nullptr, "The A matrix result pointer must be valid!"); PLSSVM_ASSERT(B.shape() == (plssvm::shape{ num_rhs, num_rows }), "B matrix sizes mismatch!: {} != [{}, {}]", B.shape(), num_rhs, num_rows); PLSSVM_ASSERT(C.shape() == (plssvm::shape{ num_rhs, num_rows }), "C matrix sizes mismatch!: {} != [{}, {}]", 
C.shape(), num_rhs, num_rows); - PLSSVM_ASSERT(num_rows >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, num_rows); + PLSSVM_ASSERT(num_rows >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, num_rows); PLSSVM_ASSERT(num_rows >= num_mirror_rows, "The number of mirror rows ({}) cannot be greater the the total number of rows ({})!", num_mirror_rows, num_rows); - PLSSVM_ASSERT(num_rows >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, num_rows); + PLSSVM_ASSERT(num_rows >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, num_rows); // calculate constants const auto blocked_num_rhs = static_cast(std::ceil(static_cast(num_rhs) / INTERNAL_BLOCK_SIZE)); @@ -136,47 +140,51 @@ inline void device_kernel_symm_mirror(const std::size_t num_rows, const std::siz // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // define range over which should be iterated - std::vector range(blocked_num_rhs * blocked_num_mirror_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector range(blocked_num_rhs * blocked_num_mirror_rows); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t rhs = idx / blocked_num_mirror_rows; - const std::size_t row = idx % blocked_num_mirror_rows; - - const std::size_t rhs_idx = rhs * INTERNAL_BLOCK_SIZE_uz; - const std::size_t row_idx = row * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_num_mirror_rows) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t dim = 0; dim < device_specific_num_rows; ++dim) { + // iterate over the remaining values + for (std::size_t dim_block = 0; dim_block < device_num_rows; dim_block += THREAD_BLOCK_SIZE_uz) { // perform the dot product calculation for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t global_row = row_idx + static_cast(internal_j); - - const real_type A_val = A[dim * (num_rows - row_offset + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows - dim + global_row]; - temp[internal_i][internal_j] += A_val * B(global_rhs, row_offset + dim); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + const real_type A_cache = A[(dim_block + dim) * (num_rows - device_row_offset + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * 
(dim_block + dim) / std::size_t{ 2 } + device_num_rows - (dim_block + dim) + global_j_idx]; + sum += A_cache * B(global_i_idx, device_row_offset + dim_block + dim); + } + temp[internal_j][internal_i] += sum; } } } - // apply the (partial) BLAS operation and update C + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const std::size_t global_rhs = rhs_idx + static_cast(internal_i); - const std::size_t partial_global_row = row_idx + static_cast(internal_j); - const std::size_t global_row = row_offset + device_specific_num_rows + row_idx + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_rhs < num_rhs && partial_global_row < num_mirror_rows) { - C(global_rhs, global_row) = alpha * temp[internal_i][internal_j] + beta * C(global_rhs, global_row); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_num_rows + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs && partial_global_j_idx < num_mirror_rows) { + C(global_i_idx, global_j_idx) = alpha * temp[internal_j][internal_i] + beta * C(global_i_idx, global_j_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp index af1d3c9e2..f4bf41d0d 100644 --- a/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_explicit/kernel_matrix_assembly.hpp @@ -32,82 +32,89 @@ namespace plssvm::hpx::detail { /** - * @brief Assemble the kernel matrix using the @p kernel function. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Assemble the kernel matrix using the @p kernel_function function. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[out] kernel_matrix the resulting kernel matrix * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] q the `q` vector * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -void device_kernel_assembly(std::vector &kernel_matrix, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... 
kernel_function_parameter) { +template +void device_kernel_assembly(real_type *kernel_matrix, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::vector &q, const real_type QA_cost, const real_type cost, Args... kernel_function_parameter) { + PLSSVM_ASSERT(kernel_matrix != nullptr, "The kernel matrix result pointer must be valid!"); PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(!kernel_matrix.empty(), "A matrix may not be empty!"); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); // calculate constants const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += 
THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - // calculate the indices to access the kernel matrix (the part stored on the current device) - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + real_type temp_ij = temp[internal_j][internal_i]; + // apply the final kernel function + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { + if (global_i_idx == global_j_idx) { temp_ij += cost; } - kernel_matrix[device_global_col * (num_rows - row_offset + PADDING_SIZE_uz) - device_global_col * (device_global_col + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_row] = temp_ij; + // update the upper triangular kernel matrix + kernel_matrix[device_global_j_idx * (num_rows - device_row_offset + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 06df89dac..78a0f93d1 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -34,25 +34,25 @@ namespace plssvm::hpx::detail { /** - * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. - * @tparam kernel the compile-time kernel function to use - * @tparam Args the types of the potential additional arguments for the @p kernel function + * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. + * @tparam kernel_function the compile-time kernel function to use + * @tparam Args the types of the potential additional arguments for the @p kernel_function function * @param[in] alpha the scalar alpha value * @param[in] q the `q` vector * @param[in] data the data matrix - * @param[in] device_specific_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data the current device is responsible for + * @param[in] device_num_rows the number of rows the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] QA_cost he bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] B the matrix @p B * @param[in,out] C the matrix @p C - * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel function + * @param[in] kernel_function_parameter the potential additional arguments for the @p kernel_function function */ -template -inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... kernel_function_parameter) { +template +inline void device_kernel_assembly_symm(const real_type alpha, const std::vector &q, const soa_matrix &data, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type QA_cost, const real_type cost, const soa_matrix &B, soa_matrix &C, Args... 
kernel_function_parameter) { PLSSVM_ASSERT(q.size() == data.num_rows() - 1, "Sizes mismatch!: {} != {}", q.size(), data.num_rows() - 1); - PLSSVM_ASSERT(q.size() >= device_specific_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_specific_num_rows, q.size()); - PLSSVM_ASSERT(q.size() >= row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", row_offset, q.size()); + PLSSVM_ASSERT(q.size() >= device_num_rows, "The number of place specific rows ({}) cannot be greater the the total number of rows ({})!", device_num_rows, q.size()); + PLSSVM_ASSERT(q.size() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of rows ({})!", device_row_offset, q.size()); PLSSVM_ASSERT(cost != real_type{ 0.0 }, "cost must not be 0.0 since it is 1 / plssvm::cost!"); PLSSVM_ASSERT(B.shape() == C.shape(), "The matrices B and C must have the same shape!"); PLSSVM_ASSERT(B.num_cols() == q.size(), "The number of columns in B ({}) must be the same as the values in q ({})!", B.num_cols(), q.size()); @@ -61,64 +61,89 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const std::size_t num_rows = data.num_rows() - 1; const std::size_t num_features = data.num_cols(); const std::size_t num_classes = B.num_rows(); - const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - row_offset) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_rows = static_cast(std::ceil(static_cast(device_specific_num_rows) / INTERNAL_BLOCK_SIZE)); + const auto blocked_row_range = static_cast(std::ceil(static_cast(num_rows - device_row_offset) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_rows = static_cast(std::ceil(static_cast(device_num_rows) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // count the number of entries in the final index list - std::vector indices(blocked_row_range * blocked_device_specific_num_rows); // define range over which should be iterated + // define the range over which should be iterated + std::vector indices(blocked_row_range * blocked_device_num_rows); std::iota(indices.begin(), indices.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, indices.cbegin(), indices.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t row_idx = (idx / blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t col_idx = (idx % blocked_device_specific_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t i_idx = (idx / blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t j_idx = (idx % blocked_device_num_rows) * INTERNAL_BLOCK_SIZE_uz; // only calculate the upper triangular matrix - if (row_idx >= col_idx) { - // only calculate the upper triangular matrix -> done be only iterating over valid row <-> col pairs + if (i_idx >= j_idx) { // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t global_row = row_offset + row_idx 
+ static_cast(internal_row); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); - - temp[internal_row][internal_col] += detail::feature_reduce(data(global_row, dim), data(global_col, dim)); + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data(global_i_idx, feature_block + feature), data(global_j_idx, feature_block + feature)); + } + temp[internal_j][internal_i] += sum; } } } // apply the remaining part of the kernel function and store the value in the output kernel matrix - for (unsigned internal_row = 0; internal_row < INTERNAL_BLOCK_SIZE; ++internal_row) { - for (unsigned internal_col = 0; internal_col < INTERNAL_BLOCK_SIZE; ++internal_col) { - const std::size_t device_global_row = row_idx + static_cast(internal_row); - const std::size_t global_row = row_offset + row_idx + static_cast(internal_row); - const std::size_t device_global_col = col_idx + static_cast(internal_col); - const std::size_t global_col = row_offset + col_idx + static_cast(internal_col); + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_row < (num_rows - row_offset) && device_global_col < device_specific_num_rows && global_row >= global_col) { - real_type temp_ij = temp[internal_row][internal_col]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter...) + QA_cost - q[global_row] - q[global_col]; + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal - if (global_row == global_col) { - temp_ij += cost; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_row); + if (global_i_idx == global_j_idx) { + temp[internal_j][internal_i] += cost; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_j][internal_i] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes; ++class_idx) { - atomic_ref{ C(class_idx, global_row) } += alpha * temp_ij * B(class_idx, global_col); + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + atomic_ref{ C(class_block + class_idx, global_i_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry - atomic_ref{ C(class_idx, global_col) } += alpha * temp_ij * B(class_idx, global_row); + atomic_ref{ C(class_block + class_idx, global_j_idx) } += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } } diff --git a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp index 6c0cd8a43..35e79d01d 100644 --- a/include/plssvm/backends/HPX/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/HPX/kernel/kernel_functions.hpp @@ -28,42 +28,17 @@ namespace plssvm::hpx::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 7ea68e172..050425b8a 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -16,7 +16,7 @@ #include "plssvm/backends/HPX/detail/utility.hpp" // plssvm::hpx::detail::atomic_ref #include "plssvm/backends/HPX/kernel/kernel_functions.hpp" // plssvm::hpx::detail::{feature_reduce, apply_kernel_function} -#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, FEATURE_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/constants.hpp" // plssvm::{real_type, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/matrix.hpp" // plssvm::aos_matrix, plssvm::soa_matrix @@ -38,59 +38,63 @@ namespace plssvm::hpx::detail { * @param[out] w the vector to speedup the linear prediction * @param[in] alpha the previously learned weights * @param[in] support_vectors the support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first row in @p support_vectors the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first row in @p support_vectors the current device is responsible for */ -inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_specific_num_sv, const std::size_t sv_offset) { +inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &alpha, const soa_matrix &support_vectors, const std::size_t device_num_sv, const std::size_t device_sv_offset) { PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(w.shape() == (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() }), "Shape mismatch: {} vs {}!", w.shape(), (plssvm::shape{ alpha.num_rows(), support_vectors.num_cols() })); - PLSSVM_ASSERT(support_vectors.num_rows() >= device_specific_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_specific_num_sv, support_vectors.num_rows()); - PLSSVM_ASSERT(support_vectors.num_rows() >= sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", sv_offset, support_vectors.num_rows()); + 
PLSSVM_ASSERT(support_vectors.num_rows() >= device_num_sv, "The number of place specific sv ({}) cannot be greater the the total number of sv ({})!", device_num_sv, support_vectors.num_rows()); + PLSSVM_ASSERT(support_vectors.num_rows() >= device_sv_offset, "The sv offset ({}) cannot be greater the the total number of sv ({})!", device_sv_offset, support_vectors.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = support_vectors.num_cols(); const auto blocked_num_features = static_cast(std::ceil(static_cast(num_features) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated + // define the range over which should be iterated std::vector range(blocked_num_classes * blocked_num_features); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t feature = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t feature_idx = feature * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t feature_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; - // iterate over all features - for (std::size_t sv = 0; sv < device_specific_num_sv; ++sv) { - // perform the feature reduction calculation + // iterate over all support vectors + for (std::size_t sv_block = 0; sv_block < device_num_sv; sv_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha(global_class_idx, sv_offset + sv) * support_vectors(sv_offset + sv, global_feature_idx); + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha(global_class_idx, device_sv_offset + sv_block + sv) * support_vectors(device_sv_offset + sv_block + sv, global_feature_idx); + } + temp[internal_class][internal_feature] += sum; } } } - // update global array with local one + // store the result back to the w vector for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_feature_idx = 
feature_idx + static_cast(internal_feature); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w(global_class_idx, global_feature_idx) = temp[internal_feature][internal_class]; + w(global_class_idx, global_feature_idx) = temp[internal_class][internal_feature]; } } }); @@ -102,63 +106,64 @@ inline void device_kernel_w_linear(soa_matrix &w, const aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset) { +inline void device_kernel_predict_linear(aos_matrix &prediction, const soa_matrix &w, const std::vector &rho, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset) { PLSSVM_ASSERT(w.num_rows() == rho.size(), "Size mismatch: {} vs {}!", w.num_rows(), rho.size()); PLSSVM_ASSERT(w.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", w.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), w.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), w.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_classes = prediction.num_cols(); - const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = predict_points.num_cols(); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); + const auto blocked_num_classes = static_cast(std::ceil(static_cast(num_classes) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_classes); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_classes); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), 
range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_classes; - const std::size_t c = idx % blocked_num_classes; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = c * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t class_idx = (idx % blocked_num_classes) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { - // perform the feature reduction calculation + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { + // perform the dot product calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_pp][internal_class] += w(global_class_idx, dim) * predict_points(global_pp_idx, dim); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w(global_class_idx, feature_block + feature) * predict_points(global_pp_idx, feature_block + feature); + } + temp[internal_class][internal_pp] += sum; } } } - // perform the dot product calculation + // store the result back to the w vector for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - if (device_global_pp_idx < device_specific_num_predict_points && global_class_idx < num_classes) { - prediction(global_pp_idx, global_class_idx) = temp[internal_pp][internal_class] - rho[global_class_idx]; - } + prediction(global_pp_idx, global_class_idx) = temp[internal_class][internal_pp] - rho[global_class_idx]; } } }); @@ -166,61 +171,63 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons /** * @brief Predict the @p predict_points_d using the @p kernel_function. 
- * @tparam kernel the type of the used kernel function + * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function * @param[out] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned bias * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict - * @param[in] device_specific_num_predict_points the number of predict points the current device is responsible for - * @param[in] row_offset the first row in @p predict_points the current device is responsible for + * @param[in] device_num_predict_points the number of predict points the current device is responsible for + * @param[in] device_row_offset the first row in @p predict_points the current device is responsible for * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ -template -inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_specific_num_predict_points, const std::size_t row_offset, Args... kernel_function_parameter) { +template +inline void device_kernel_predict(aos_matrix &prediction, const aos_matrix &alpha, const std::vector &rho, const soa_matrix &support_vectors, const soa_matrix &predict_points, const std::size_t device_num_predict_points, const std::size_t device_row_offset, Args... kernel_function_parameter) { PLSSVM_ASSERT(alpha.num_rows() == rho.size(), "Size mismatch: {} vs {}!", alpha.num_rows(), rho.size()); PLSSVM_ASSERT(alpha.num_cols() == support_vectors.num_rows(), "Size mismatch: {} vs {}!", alpha.num_cols(), support_vectors.num_rows()); PLSSVM_ASSERT(support_vectors.num_cols() == predict_points.num_cols(), "Size mismatch: {} vs {}!", support_vectors.num_cols(), predict_points.num_cols()); PLSSVM_ASSERT(prediction.shape() == (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() }), "Shape mismatch: {} vs {}!", prediction.shape(), (plssvm::shape{ predict_points.num_rows(), alpha.num_rows() })); - PLSSVM_ASSERT(predict_points.num_rows() >= device_specific_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_specific_num_predict_points, predict_points.num_rows()); - PLSSVM_ASSERT(predict_points.num_rows() >= row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", row_offset, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_num_predict_points, "The number of place specific predict points ({}) cannot be greater the the total number of predict points ({})!", device_num_predict_points, predict_points.num_rows()); + PLSSVM_ASSERT(predict_points.num_rows() >= device_row_offset, "The row offset ({}) cannot be greater the the total number of predict points ({})!", device_row_offset, predict_points.num_rows()); // calculate constants const std::size_t num_classes = alpha.num_rows(); const std::size_t num_support_vectors = support_vectors.num_rows(); - const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); - const auto blocked_device_specific_num_predict_points = static_cast(std::ceil(static_cast(device_specific_num_predict_points) / INTERNAL_BLOCK_SIZE)); const std::size_t num_features = 
predict_points.num_cols(); + const auto blocked_num_support_vectors = static_cast(std::ceil(static_cast(num_support_vectors) / INTERNAL_BLOCK_SIZE)); + const auto blocked_device_num_predict_points = static_cast(std::ceil(static_cast(device_num_predict_points) / INTERNAL_BLOCK_SIZE)); // cast all values to 64-bit unsigned long long to prevent potential 32-bit overflows const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - // define range over which should be iterated - std::vector range(blocked_device_specific_num_predict_points * blocked_num_support_vectors); + // define the range over which should be iterated + std::vector range(blocked_device_num_predict_points * blocked_num_support_vectors); std::iota(range.begin(), range.end(), 0); ::hpx::for_each(::hpx::execution::par_unseq, range.cbegin(), range.cend(), [&](const std::size_t idx) { // calculate the indices used in the current thread - const std::size_t pp = idx / blocked_num_support_vectors; - const std::size_t sv = idx % blocked_num_support_vectors; - - const std::size_t pp_idx = pp * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = sv * INTERNAL_BLOCK_SIZE_uz; + const std::size_t pp_idx = (idx / blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; + const std::size_t sv_idx = (idx % blocked_num_support_vectors) * INTERNAL_BLOCK_SIZE_uz; // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; // iterate over all features - for (std::size_t dim = 0; dim < num_features; ++dim) { + for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { // perform the feature reduction calculation for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors(global_sv_idx, dim), - predict_points(global_pp_idx, dim)); + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors(global_sv_idx, feature_block + feature), predict_points(global_pp_idx, feature_block + feature)); + } + temp[internal_sv][internal_pp] += sum; } } } @@ -228,25 +235,23 @@ inline void device_kernel_predict(aos_matrix &prediction, const aos_m // update temp using the respective kernel function for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter...); + temp[internal_sv][internal_pp] = detail::apply_kernel_function(temp[internal_sv][internal_pp], kernel_function_parameter...); } } // add results to prediction - for (std::size_t a = 0; a < num_classes; ++a) { + for (std::size_t class_block = 0; class_block < num_classes; class_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned 
internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const std::size_t device_global_pp_idx = pp_idx + static_cast(internal_pp); - const std::size_t global_pp_idx = row_offset + pp_idx + static_cast(internal_pp); - const std::size_t global_sv_idx = sv_idx + static_cast(internal_sv); + // calculate the indices to access the global data and the data with respect to the current device + const auto global_pp_idx = device_row_offset + pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); - // be sure to not perform out of bounds accesses - if (device_global_pp_idx < device_specific_num_predict_points && global_sv_idx < num_support_vectors) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { if (global_sv_idx == 0) { - atomic_ref{ prediction(global_pp_idx, a) } += -rho[a]; + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += -rho[class_block + class_idx]; } - atomic_ref{ prediction(global_pp_idx, a) } += - temp[internal_pp][internal_sv] * alpha(a, global_sv_idx); + atomic_ref{ prediction(global_pp_idx, class_block + class_idx) } += alpha(class_block + class_idx, global_sv_idx) * temp[internal_sv][internal_pp]; } } } diff --git a/src/plssvm/backends/HPX/csvm.cpp b/src/plssvm/backends/HPX/csvm.cpp index 71f651688..4c24192dd 100644 --- a/src/plssvm/backends/HPX/csvm.cpp +++ b/src/plssvm/backends/HPX/csvm.cpp @@ -18,6 +18,7 @@ #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/data_distribution.hpp" // plssvm::detail::triangular_data_distribution #include "plssvm/detail/logging/mpi_log_untracked.hpp" // plssvm::detail::log_untracked +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::{make_unique_for_overwrite, parallel_zero_memset} #include "plssvm/detail/memory_size.hpp" // plssvm::detail::memory_size #include "plssvm/detail/move_only_any.hpp" // plssvm::detail::{move_only_any, move_only_any_cast} #include "plssvm/detail/tracking/performance_tracker.hpp" // plssvm::detail::tracking::tracking_entry, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY @@ -120,26 +121,33 @@ std::vector<::plssvm::detail::move_only_any> csvm::assemble_kernel_matrix(const // get the offset of the data points this device is responsible for const std::size_t row_offset = dist.place_row_offset(0); - std::vector kernel_matrix(dist.calculate_explicit_kernel_matrix_num_entries_padded(0)); // only explicitly store the upper triangular matrix + // get the number of kernel matrix entries + const std::size_t num_entries = dist.calculate_explicit_kernel_matrix_num_entries_padded(0); + + // only explicitly store the upper triangular matrix + auto kernel_matrix = ::plssvm::detail::make_unique_for_overwrite(num_entries); + // initialize kernel matrix to all zeros in parallel + ::plssvm::detail::parallel_zero_memset(kernel_matrix.get(), num_entries); + const auto start = std::chrono::steady_clock::now(); switch (params.kernel_type) { case kernel_function_type::linear: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost); break; case kernel_function_type::polynomial: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, 
device_specific_num_rows, row_offset, q_red, QA_cost, cost, params.degree, std::get(params.gamma), params.coef0); break; case kernel_function_type::rbf: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::sigmoid: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma), params.coef0); break; case kernel_function_type::laplacian: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; case kernel_function_type::chi_squared: - detail::device_kernel_assembly(kernel_matrix, A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); + detail::device_kernel_assembly(kernel_matrix.get(), A, device_specific_num_rows, row_offset, q_red, QA_cost, cost, std::get(params.gamma)); break; } const auto end = std::chrono::steady_clock::now(); @@ -200,16 +208,16 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s break; case solver_type::cg_explicit: { - const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); + const auto &explicit_A = ::plssvm::detail::move_only_any_cast &>(A.front()); PLSSVM_ASSERT(!explicit_A.empty(), "The A matrix must not be empty!"); const auto start = std::chrono::steady_clock::now(); - detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm(num_rows, num_rhs, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); const std::size_t num_mirror_rows = num_rows - row_offset - device_specific_num_rows; if (num_mirror_rows > std::size_t{ 0 }) { - detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A, B, beta, C); + detail::device_kernel_symm_mirror(num_rows, num_rhs, num_mirror_rows, device_specific_num_rows, row_offset, alpha, explicit_A.get(), B, beta, C); } const auto end = std::chrono::steady_clock::now(); @@ -261,6 +269,8 @@ void csvm::blas_level_3(const solver_type solver, const real_type alpha, const s }); // wait until operation is completed wait.get(); + // restore padding entries by setting them to zero + C.restore_padding(); } //***************************************************// @@ -317,6 +327,8 @@ aos_matrix csvm::predict_values(const parameter ¶ms, [[maybe_unused]] const auto duration = std::chrono::duration_cast(end - start); PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "predict_values", "w_kernel", duration })); } + // restore padding entries by setting them to zero + w.restore_padding(); // reduce w on all MPI ranks comm_.allreduce_inplace(w); @@ -358,6 +370,9 @@ aos_matrix csvm::predict_values(const parameter ¶ms, }); // wait until operation is completed wait.get(); + + // restore padding entries by setting them to zero + 
out.restore_padding(); return out; } From ff892127bfd52ea44fd498a64a4df558936ebc2a Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:13:41 +0200 Subject: [PATCH 47/93] Some small changes: where possible change remaining const to constexpr, remove superfluous braces, add missing static_casts, and use correct THREAD_BLOCK_SIZE_uz. --- .../backends/CUDA/kernel/cg_explicit/blas.cuh | 20 ++++++++-------- .../kernel_matrix_assembly_blas.cuh | 8 +++---- .../backends/CUDA/kernel/predict_kernel.cuh | 24 +++++++++---------- .../HIP/kernel/cg_explicit/blas.hip.hpp | 20 ++++++++-------- .../kernel_matrix_assembly_blas.hip.hpp | 8 +++---- .../HIP/kernel/predict_kernel.hip.hpp | 24 +++++++++---------- 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh index bacc84852..ab6c7b11b 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_explicit/blas.cuh @@ -36,9 +36,9 @@ namespace plssvm::cuda::detail { */ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -128,9 +128,9 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t */ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -207,8 +207,8 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: */ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all 
values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -242,8 +242,8 @@ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, rea */ __global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index bf1ee66e5..9861f2fb7 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -45,9 +45,9 @@ namespace plssvm::cuda::detail { template __global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -118,7 +118,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses (only using the upper triangular matrix) - if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 285cdc3a6..9d20863c8 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -36,9 +36,9 @@ namespace plssvm::cuda::detail { */ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -115,9 +115,9 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con */ __global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr 
auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -200,9 +200,9 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t template __global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -232,8 +232,8 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA @@ -276,7 +276,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS diff --git a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp index b2e9c8ce3..9f5821634 100644 --- a/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_explicit/blas.hip.hpp @@ -39,9 +39,9 @@ namespace plssvm::hip::detail { */ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, 
const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -131,9 +131,9 @@ __global__ void device_kernel_symm(const std::size_t num_rows, const std::size_t */ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -210,8 +210,8 @@ __global__ void device_kernel_symm_mirror(const std::size_t num_rows, const std: */ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, real_type *lhs, const real_type *rhs, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -245,8 +245,8 @@ __global__ void device_kernel_inplace_matrix_add(const std::size_t num_cols, rea */ __global__ void device_kernel_inplace_matrix_scale(const std::size_t num_cols, real_type *lhs, const real_type scale, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension 
const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 97ef0798b..2bc4a230f 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -47,9 +47,9 @@ namespace plssvm::hip::detail { template __global__ void device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -120,7 +120,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto global_j_idx = device_row_offset + device_global_j_idx; // be sure to not perform out of bounds accesses (only using the upper triangular matrix) - if ((device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx)) { + if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; // apply the cost on the diagonal diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 9aaba6c5e..6ba12a360 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -38,9 +38,9 @@ namespace plssvm::hip::detail { */ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -117,9 +117,9 @@ __global__ void device_kernel_w_linear(real_type *w, const real_type *alpha, con */ __global__ void device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -202,9 +202,9 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t template __global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); const auto threadIdx_x = static_cast(threadIdx.x); // current thread in block x-dimension const auto threadIdx_y = static_cast(threadIdx.y); // current thread in block y-dimension @@ -234,8 +234,8 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; - const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE; + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA @@ -278,7 +278,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // load data into shared memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { // calculate the indices to access the global data, pays attention to coalesced memory accesses - const auto global_sv_idx_linear = sv_idx_linear + internal * THREAD_BLOCK_SIZE; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the shared memory alpha_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = alpha[(class_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS From b4d553ab3fd22ee15814e228b1f34bee5313496c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:14:13 +0200 Subject: [PATCH 48/93] Update comments. 
--- include/plssvm/backends/HPX/kernel/predict_kernel.hpp | 2 +- include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp index 050425b8a..e98d09a58 100644 --- a/include/plssvm/backends/HPX/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/HPX/kernel/predict_kernel.hpp @@ -156,7 +156,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons } } - // store the result back to the w vector + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data diff --git a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp index 7bea4b3c4..d8cd4a0be 100644 --- a/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp +++ b/include/plssvm/backends/OpenMP/kernel/predict_kernel.hpp @@ -154,7 +154,7 @@ inline void device_kernel_predict_linear(aos_matrix &prediction, cons } } - // store the result back to the w vector + // update the global array with the local one for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { // calculate the indices to access the global data From 4020339b1eed19e92427d3470b53dd7f72c82709 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 14 Jun 2025 16:55:53 +0200 Subject: [PATCH 49/93] Rename sv to support_vectors for better readability and consistency. --- include/plssvm/backends/CUDA/kernel/predict_kernel.cuh | 6 +++--- include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh index 9d20863c8..d7ebf45a3 100644 --- a/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh +++ b/include/plssvm/backends/CUDA/kernel/predict_kernel.cuh @@ -187,7 +187,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned biases - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors @@ -198,7 +198,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -237,7 +237,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data diff --git a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp index 6ba12a360..9ee22edc4 100644 --- a/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/predict_kernel.hip.hpp @@ -189,7 +189,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] prediction the predicted values * @param[in] alpha the previously learned weights * @param[in] rho the previously learned biases - * @param[in] sv the support vectors + * @param[in] support_vectors the support vectors * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors @@ -200,7 +200,7 @@ __global__ void device_kernel_predict_linear(real_type *prediction, const real_t * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ template -__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *sv, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) { +__global__ void device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); @@ -239,7 +239,7 @@ __global__ void device_kernel_predict(real_type *prediction, const real_type *al // store the values in the shared memory pp_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = predict_points[(feature_block + threadIdx_y) * (num_predict_points + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA - sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = sv[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA + sv_cache[threadIdx.y][internal * THREAD_BLOCK_SIZE + threadIdx.x] = support_vectors[(feature_block + threadIdx_y) * (num_sv + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } __syncthreads(); // wait until all threads loaded their part of the data From 39513f8a36be4c9bc821226bf32c0cd3e3323c08 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 12:01:10 +0200 Subject: [PATCH 50/93] Update some comments. --- .../kernel/cg_implicit/kernel_matrix_assembly_blas.cuh | 6 +++--- .../cg_implicit/kernel_matrix_assembly_blas.hip.hpp | 6 +++--- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 5 ++++- .../kernel/cg_implicit/kernel_matrix_assembly_blas.hpp | 9 ++++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh index 9861f2fb7..186400757 100644 --- a/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh +++ b/include/plssvm/backends/CUDA/kernel/cg_implicit/kernel_matrix_assembly_blas.cuh @@ -58,10 +58,10 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices used in the current thread const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset - const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // create two shared memory arrays used for caching @@ -117,7 +117,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], 
kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp index 2bc4a230f..b2bee8d46 100644 --- a/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp +++ b/include/plssvm/backends/HIP/kernel/cg_implicit/kernel_matrix_assembly_blas.hip.hpp @@ -60,10 +60,10 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty // calculate the indices used in the current thread const auto i_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset - const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // calculate the indices used in the current thread, pays attention to coalesced memory accesses - const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows + const auto i_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // num_rows - device_row_offset const auto j_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_x; // device_num_rows // create two shared memory arrays used for caching @@ -119,7 +119,7 @@ __global__ void device_kernel_assembly_symm(const real_type alpha, const real_ty const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 78a0f93d1..d6abc8cab 100644 --- a/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/HPX/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -82,6 +82,9 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -108,7 +111,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) + QA_cost - q[global_i_idx] - q[global_j_idx]; diff --git a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp index 391b9fd90..952225c06 100644 --- a/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/OpenMP/kernel/cg_implicit/kernel_matrix_assembly_blas.hpp @@ -75,6 +75,9 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector // create a thread private array used for internal caching std::array, INTERNAL_BLOCK_SIZE> temp{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// // iterate over all features for (std::size_t feature_block = 0; feature_block < num_features; feature_block += THREAD_BLOCK_SIZE_uz) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -101,7 +104,7 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector const auto device_global_j_idx = j_idx + static_cast(internal_j); const auto global_j_idx = device_row_offset + device_global_j_idx; - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) if (device_global_i_idx < (num_rows - device_row_offset) && device_global_j_idx < device_num_rows && global_i_idx >= global_j_idx) { // apply the final kernel function temp[internal_j][internal_i] = detail::apply_kernel_function(temp[internal_j][internal_i], kernel_function_parameter...) 
+ QA_cost - q[global_i_idx] - q[global_j_idx]; @@ -128,13 +131,13 @@ inline void device_kernel_assembly_symm(const real_type alpha, const std::vector if (global_i_idx == global_j_idx) { // only apply once to the diagonal - for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_i_idx); } } else { // apply it for the upper and lower triangular matrix - for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { #pragma omp atomic C(class_block + class_idx, global_i_idx) += alpha * temp[internal_j][internal_i] * B(class_block + class_idx, global_j_idx); // symmetry From 3ef281db60488a7cbe891f9c704afb87a041074d Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:25:21 +0200 Subject: [PATCH 51/93] Also use trimmed names in performance tracking output. --- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp index 12910a7ae..861344f5b 100644 --- a/src/plssvm/backends/SYCL/DPCPP/csvm.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/csvm.cpp @@ -147,7 +147,7 @@ void csvm::init(const target_platform target) { " [{}, {}]\n", device, trimmed_device_name); - device_names.emplace_back(device_name); + device_names.emplace_back(trimmed_device_name); } } From a20d76d2059ef822a97f3165c5db9193374850c7 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:26:09 +0200 Subject: [PATCH 52/93] Always use a loop for the custom powi function. --- .../backends/SYCL/kernel/kernel_functions.hpp | 35 +++---------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp index 97c5c6248..6cfa159bc 100644 --- a/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp +++ b/include/plssvm/backends/SYCL/kernel/kernel_functions.hpp @@ -30,42 +30,17 @@ namespace plssvm::sycl::detail { /** * @brief Fast integer power function. Computes base^exponent and takes advantage of the fact that degree may only be positive integer values. - * @details Hardcodes the power function for degree <= 6, uses a simple for loop otherwise. 
* @param[in] base the base * @param[in] exponent the exponent * @return base^exponent (`[[nodiscard]]`) */ [[nodiscard]] inline real_type powi(const real_type base, const int exponent) { - switch (exponent) { - case 0: return real_type{ 1.0 }; - case 1: return base; - case 2: return base * base; - case 3: return base * base * base; - case 4: - { - const real_type temp = base * base; - return temp * temp; - } - case 5: - { - const real_type temp = base * base; - return temp * temp * base; - } - case 6: - { - const real_type temp = base * base * base; - return temp * temp; - } - default: - { - // generic integer power function - real_type result{ 1.0 }; - for (int i = 0; i < exponent; ++i) { - result *= base; - } - return result; - } + // generic integer power function + real_type result{ 1.0 }; + for (int i = 0; i < exponent; ++i) { + result *= base; } + return result; } //***************************************************// From 1c55fb151d2e7b2f8e29a4a9ec9e8cf3c3015098 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Fri, 20 Jun 2025 15:27:18 +0200 Subject: [PATCH 53/93] The get_default_queue now honors the default target platform. --- src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp b/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp index 28742b23f..6f14f9271 100644 --- a/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp +++ b/src/plssvm/backends/SYCL/DPCPP/detail/utility.cpp @@ -10,6 +10,7 @@ #include "plssvm/backends/SYCL/DPCPP/detail/queue.hpp" // plssvm::adaptivecpp::detail::queue #include "plssvm/backends/SYCL/DPCPP/detail/queue_impl.hpp" // plssvm::dpcpp::detail::queue (PImpl implementation) +#include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT #include "plssvm/detail/string_utility.hpp" // plssvm::detail::{as_lower_case, contains} #include "plssvm/detail/utility.hpp" // plssvm::detail::contains #include "plssvm/exceptions/exceptions.hpp" // plssvm::platform_devices_empty @@ -101,9 +102,11 @@ void device_synchronize(const queue &q) { } queue get_default_queue() { - queue q; - q.impl = std::make_shared(); - return q; + const auto &[devices, target] = detail::get_device_list(determine_default_target_platform()); + // at least one platform must be present + PLSSVM_ASSERT(!devices.empty(), "At least one device must be available!"); + // per default, use the first device for the tests + return devices.front(); } std::string get_dpcpp_version() { From b6b98fc6e5f59bc22fe4a928cdc33473a02051bf Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sat, 21 Jun 2025 17:38:25 +0200 Subject: [PATCH 54/93] Improve the AdaptiveCpp device pointer creation performance on CPUs with the OpenMP backend. 
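Note on this patch: the hunk below short-circuits zero-initialization on the OpenMP CPU backend to plssvm::detail::parallel_zero_memset instead of going through sycl::queue::memset. The following is only a minimal sketch of the underlying idea using a hypothetical standalone helper (name, signature, and chunking are illustrative, not the library's actual implementation): zeroing a large host allocation from all OpenMP threads is much faster than a single-threaded memset and, via first-touch, also places the pages on the NUMA nodes that later access them.

    #include <cstddef>  // std::size_t
    #include <cstring>  // std::memset

    #include <omp.h>

    // Hypothetical helper: zero `count` elements of a trivially-copyable type T using all
    // OpenMP threads; each thread memsets one contiguous chunk (parallel first-touch).
    template <typename T>
    void parallel_zero_memset_sketch(T *ptr, const std::size_t count) {
    #pragma omp parallel
        {
            const auto num_threads = static_cast<std::size_t>(omp_get_num_threads());
            const auto thread_id = static_cast<std::size_t>(omp_get_thread_num());
            // split the range into roughly equal contiguous chunks
            const std::size_t chunk = (count + num_threads - 1) / num_threads;
            const std::size_t begin = thread_id * chunk;
            const std::size_t end = begin + chunk < count ? begin + chunk : count;
            if (begin < end) {
                std::memset(ptr + begin, 0, (end - begin) * sizeof(T));
            }
        }
    }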
--- .../backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp index 0338d10c9..44a9b9108 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/detail/device_ptr.cpp @@ -12,6 +12,7 @@ #include "plssvm/backends/SYCL/AdaptiveCpp/detail/queue_impl.hpp" // plssvm::adaptivecpp::detail::queue (PImpl implementation) #include "plssvm/backends/SYCL/exceptions.hpp" // plssvm::adaptivecpp::backend_exception #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT +#include "plssvm/detail/make_unique_for_overwrite.hpp" // plssvm::detail::parallel_zero_memset #include "plssvm/matrix.hpp" // plssvm::aos_matrix #include "plssvm/shape.hpp" // plssvm::shape @@ -56,7 +57,14 @@ void device_ptr::memset(const int pattern, const size_type pos, const size_ty throw backend_exception{ fmt::format("Illegal access in memset!: {} >= {}", pos, this->size_padded()) }; } const size_type rnum_bytes = std::min(num_bytes, (this->size_padded() - pos) * sizeof(value_type)); - queue_.impl->sycl_queue.memset(static_cast(data_ + pos), pattern, rnum_bytes).wait(); + + ::sycl::queue &queue = queue_.impl->sycl_queue; + // using our OpenMP enhanced 0 memset functions has dramatically better performance on the OpenMP CPU backend + if (pattern == 0 && queue.get_device().is_cpu() && queue.get_device().get_backend() == ::sycl::backend::omp) { + ::plssvm::detail::parallel_zero_memset(data_ + pos, rnum_bytes / sizeof(value_type)); + } else { + queue.memset(static_cast(data_ + pos), pattern, rnum_bytes).wait(); + } } template From 88e5d80ad781c9ef9051e0de06b77896e5afb65f Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 22 Jun 2025 14:00:22 +0200 Subject: [PATCH 55/93] Based on the provided CPU target architectures, set the correct preferred vector width. Reason: GCC and clang refuse to use AVX-512 for Intel CPUs in their auto-vectorizers even on new Intel CPUs that fully support it. 
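Note on this patch: -mprefer-vector-width is an existing GCC/Clang code-generation option; the commit only wires it up based on PLSSVM_TARGET_PLATFORMS. The toy translation unit below (not part of PLSSVM; the file name and -march value are just examples) can be used to observe the effect: with -O3 -march=skylake-avx512 alone both compilers typically stick to 256-bit ymm instructions, while additionally passing -mprefer-vector-width=512 lets the auto-vectorizer emit 512-bit zmm instructions.

    // axpy_vec_width.cpp -- inspect the chosen vector width, e.g.:
    //   g++ -O3 -march=skylake-avx512                           -S axpy_vec_width.cpp   // usually ymm (256 bit)
    //   g++ -O3 -march=skylake-avx512 -mprefer-vector-width=512 -S axpy_vec_width.cpp   // zmm (512 bit)
    #include <cstddef>  // std::size_t

    void axpy(double *__restrict__ y, const double *__restrict__ x, const double alpha, const std::size_t n) {
        // simple, trivially auto-vectorizable loop
        for (std::size_t i = 0; i < n; ++i) {
            y[i] += alpha * x[i];
        }
    }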
--- CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b5c16f86..10de8e060 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -638,6 +638,37 @@ if (PLSSVM_ENABLE_LTO) endif () endif () +######################################################################################################################## +# enable the requested vectorization widths for the auto-vectorizers # +######################################################################################################################## +# GCC and clang both do not automatically auto-vectorize for AVX-512 (only AVX2) +# -> enable it if "cpu:avx512" was passed as PLSSVM_TARGET_PLATFORMS +if (PLSSVM_NUM_CPU_TARGET_ARCHS EQUAL 1) + if (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx512") + message(STATUS "Enabling AVX512 support for the auto-vectorizers (-mprefer-vector-width=512).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=512>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx2" OR ${PLSSVM_CPU_TARGET_ARCHS} STREQUAL "avx") + message(STATUS "Enabling AVX/AVX2 support for the auto-vectorizers (-mprefer-vector-width=256).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=256>> + ) + elseif (${PLSSVM_CPU_TARGET_ARCHS} MATCHES "^sse") + message(STATUS "Enabling SSE for the auto-vectorizers (-mprefer-vector-width=128).") + target_compile_options( + ${PLSSVM_BASE_LIBRARY_NAME} + PUBLIC $<$:$<$:-mprefer-vector-width=128>> + ) + else () + message(FATAL_ERROR "Unrecognized CPU target architecture \"${PLSSVM_CPU_TARGET_ARCHS}\". Allowed values are: avx512, avx2, avx, sse.") + endif () +else () + # automatically use the "optimal" auto-vectorizer width +endif () + ######################################################################################################################## # check for optional and necessary dependencies # ######################################################################################################################## From 56a0f7d7a903ceb54aa16f73051cea5800307a8c Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Sun, 22 Jun 2025 14:40:27 +0200 Subject: [PATCH 56/93] Update the SYCL backend kernels. Now: some parts of the kernels are specialized for the CPU for better performance. 
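Note on this patch: the kernels below now pick a different loop nest per target via if constexpr. On the CPU the reduction dimension becomes the innermost (fastest moving) loop, so each work-item accumulates a contiguous dot product into a scalar that the compiler can vectorize; on GPUs it stays the outermost (slowest moving) loop, keeping the blocked access pattern of the original kernels. A condensed, self-contained sketch of that pattern (names and data layout simplified, not the actual PLSSVM kernels):

    #include <cstddef>  // std::size_t

    enum class target_platform { cpu, gpu };

    // Per-target loop-order specialization: both branches compute the same partial sums,
    // they only differ in which loop moves fastest.
    template <target_platform target, unsigned block_size>
    void partial_dot(double (&temp)[block_size], const double *a, const double *b, const std::size_t len) {
        if constexpr (target == target_platform::cpu) {
            // CPU: reduction dimension innermost -> contiguous, auto-vectorizable inner loop
            for (unsigned internal = 0; internal < block_size; ++internal) {
                double sum{ 0.0 };
                for (std::size_t dim = 0; dim < len; ++dim) {
                    sum += a[dim] * b[internal * len + dim];
                }
                temp[internal] += sum;
            }
        } else {
            // GPU-like ordering: reduction dimension outermost -> all `internal` updates for one
            // dim happen back to back, matching the register-tiled structure of the device kernels
            for (std::size_t dim = 0; dim < len; ++dim) {
                for (unsigned internal = 0; internal < block_size; ++internal) {
                    temp[internal] += a[dim] * b[internal * len + dim];
                }
            }
        }
    }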
--- .../SYCL/kernel/cg_explicit/basic/blas.hpp | 230 ++-- .../basic/kernel_matrix_assembly.hpp | 77 +- .../kernel/cg_explicit/hierarchical/blas.hpp | 359 +++--- .../hierarchical/kernel_matrix_assembly.hpp | 110 +- .../SYCL/kernel/cg_explicit/scoped/blas.hpp | 353 +++--- .../scoped/kernel_matrix_assembly.hpp | 106 +- .../kernel/cg_explicit/work_group/blas.hpp | 311 +++-- .../work_group/kernel_matrix_assembly.hpp | 94 +- .../basic/kernel_matrix_assembly_blas.hpp | 136 ++- .../kernel_matrix_assembly_blas.hpp | 237 ++-- .../scoped/kernel_matrix_assembly_blas.hpp | 283 +++-- .../kernel_matrix_assembly_blas.hpp | 183 +-- .../kernel/predict/basic/predict_kernel.hpp | 315 +++-- .../predict/hierarchical/predict_kernel.hpp | 483 +++++--- .../kernel/predict/scoped/predict_kernel.hpp | 513 ++++---- .../predict/work_group/predict_kernel.hpp | 419 ++++--- src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp | 1071 ++++------------- src/plssvm/backends/SYCL/DPCPP/csvm.cpp | 894 +++----------- 18 files changed, 2974 insertions(+), 3200 deletions(-) diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp index b55b374fe..4d19c4746 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_BASIC_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,33 +66,63 @@ class device_kernel_symm { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (unsigned long long dim = 0; dim < (num_rows_ - row_offset_); ++dim) { - // perform the dot product calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + 
static_cast(internal_j); - - real_type A_val = 0.0; - // determine on which side of the diagonal we are located - if (dim < global_j) { - A_val = A_[dim * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - dim * (dim + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_val = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + // iterate over all values using blocking + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_val = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_val = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + sum += A_val * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (std::size_t dim = 0; dim < THREAD_BLOCK_SIZE_uz; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type A_val = 0.0; + // determine on which side of the diagonal we are located + if (dim_block + dim < global_j_idx) { + A_val = A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx - (dim_block + dim) * (dim_block + dim + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_val = A_[global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + dim - global_j_idx * (global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } + + temp[internal_i][internal_j] += A_val * B_[((dim_block + dim) + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } } - - temp[internal_i][internal_j] += A_val * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; } } } @@ -93,13 +130,14 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + 
static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -109,8 +147,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -125,16 +163,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -143,12 +186,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -162,25 +205,49 @@ class device_kernel_symm_mirror { * @param[in] idx indices representing the current point in the execution space */ void operator()(::sycl::item<2> idx) const { - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < 
INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); - - temp[internal_i][internal_j] += A_[(dim) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim - std::size_t{ 1 }) * dim / std::size_t{ 2 } + device_specific_num_rows_ - dim + global_j] * B_[(dim + row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += A_[(dim_block + dim) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + dim - std::size_t{ 1 }) * (dim_block + dim) / std::size_t{ 2 } + device_num_rows_ - (dim_block + dim) + global_j_idx] * // SoA, upper triangular matrix only + B_[(dim_block + dim + device_row_offset_) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA + } + } } } } @@ -188,13 +255,14 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = 
device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -205,8 +273,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -223,6 +291,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -244,19 +315,21 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -277,6 +350,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[in] num_cols the number of columns in the matrix @@ -298,19 +374,21 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp index 22b24bae0..f808c56fc 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/basic/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -27,12 +29,16 @@ namespace plssvm::sycl::detail::basic { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
* @param[out] kernel_matrix the calculated kernel matrix @@ -60,7 +66,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -74,22 +80,45 @@ class device_kernel_assembly { constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a private memory array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = device_row_offset_ + i + static_cast(internal_i); - const auto global_j = device_row_offset_ + j + static_cast(internal_j); - temp[internal_i][internal_j] += detail::feature_reduce(data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + 
feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -97,23 +126,23 @@ class device_kernel_assembly { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp index 5e5803652..627eaadbe 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_HIERARCHICAL_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -24,15 +26,20 @@ namespace 
plssvm::sycl::detail::hierarchical { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -59,36 +66,15 @@ class device_kernel_symm { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - // cast 
all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -97,30 +83,44 @@ class device_kernel_symm { } }); - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + 
const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); @@ -128,13 +128,28 @@ class device_kernel_symm { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; 
internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } @@ -145,17 +160,31 @@ class device_kernel_symm { // apply the (partial) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -166,8 +195,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -182,16 +211,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B 
+ beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -200,12 +234,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -219,36 +253,15 @@ class device_kernel_symm_mirror { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type A_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type B_cache_[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // create two local memory arrays used for caching + real_type A_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type B_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array 
used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices and diagonal condition - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -256,39 +269,67 @@ class device_kernel_symm_mirror { } }); - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = 
static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); // implicit barrier - // perform the feature reduction calculation + // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + 
internal_i]; + } } } } @@ -299,17 +340,31 @@ class device_kernel_symm_mirror { // apply the (remaining) BLAS operation and update C group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -321,8 +376,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -339,6 +394,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
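The C update above and the element-wise in-place kernels that follow all use the same padded row-major layout: element (row, col) of a matrix with num_cols columns lives at row * (num_cols + PADDING_SIZE) + col. A minimal host-side reference of what device_kernel_inplace_matrix_add computes under that layout (standalone sketch, double stands in for real_type, not part of the patch):

#include <cstddef>
#include <vector>

// Minimal host-side reference for the element-wise in-place add, assuming the
// padded row-major layout described above: element (row, col) is stored at
// row * (num_cols + padding) + col. double stands in for real_type; the padding
// entries at the end of each row are deliberately left untouched.
void inplace_matrix_add_reference(std::vector<double> &lhs, const std::vector<double> &rhs,
                                  const std::size_t num_rows, const std::size_t num_cols,
                                  const std::size_t padding) {
    for (std::size_t row = 0; row < num_rows; ++row) {
        for (std::size_t col = 0; col < num_cols; ++col) {
            lhs[row * (num_cols + padding) + col] += rhs[row * (num_cols + padding) + col];
        }
    }
}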
* @param[in] num_cols the number of columns in both matrices @@ -361,25 +419,27 @@ class device_kernel_inplace_matrix_add { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -401,6 +461,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
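The index arithmetic above hands every work-item its own INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE tile of the matrix; the scale kernel below reuses the same mapping. A standalone sketch of that mapping with plain integers instead of the SYCL id types (hypothetical helper name, illustrative only):

#include <cstddef>

// Illustrative only: reproduces the i_idx/j_idx computation above for a single
// work-item, i.e. the top-left element of the INTERNAL_BLOCK_SIZE x
// INTERNAL_BLOCK_SIZE tile that work-item updates.
struct tile_origin {
    std::size_t i_idx;  // first row handled by this work-item
    std::size_t j_idx;  // first column handled by this work-item
};

constexpr tile_origin work_item_tile_origin(const std::size_t block_idx_x, const std::size_t block_idx_y,
                                            const std::size_t block_dim_x, const std::size_t block_dim_y,
                                            const std::size_t thread_idx_x, const std::size_t thread_idx_y,
                                            const std::size_t internal_block_size) {
    return tile_origin{ (block_idx_y * block_dim_y + thread_idx_y) * internal_block_size,
                        (block_idx_x * block_dim_x + thread_idx_x) * internal_block_size };
}

For example, with a 16 x 16 work-group and INTERNAL_BLOCK_SIZE = 4 (hypothetical values), the work-item (thread_idx_x, thread_idx_y) = (0, 1) in work-group (0, 0) starts at row 4, column 0 and updates rows 4-7, columns 0-3.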
* @param[in] num_cols the number of columns in the matrix @@ -423,25 +486,27 @@ class device_kernel_inplace_matrix_scale { void operator()(::sycl::group<2> group) const { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(0); - const std::size_t threadIdx_y = idx.get_local_id(1); - const std::size_t blockDim_x = idx.get_local_range(0); - const std::size_t blockDim_y = idx.get_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; - - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); + + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp index d3e37ca54..3bc6d0878 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/hierarchical/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // 
plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,12 +30,16 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[out] kernel_matrix the calculated kernel matrix @@ -61,7 +67,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -69,7 +75,7 @@ class device_kernel_assembly { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // create two local memory arrays used for caching data point features + // create two local memory arrays used for caching real_type data_i_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; real_type data_j_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; @@ -78,8 +84,17 @@ class device_kernel_assembly { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + // initialize private temp matrix to zero + group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; + } + } + }); + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { // cast values to 32-bit unsigned int values to prevent implicit conversions @@ -91,25 +106,25 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = 
group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -121,11 +136,26 @@ class device_kernel_assembly { const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 
0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -140,36 +170,36 @@ class device_kernel_assembly { constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points 
and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp index 2e6983255..9d3d6bef8 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_SCOPED_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -24,15 +26,20 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. 
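The kernel matrix written above is stored as a packed upper triangle: row j keeps only the entries i >= j, so each row is one entry shorter than its predecessor (plus padding). The offset expression used there reduces to j * n - j * (j + 1) / 2 + i with n = num_rows_ - device_row_offset_ + PADDING_SIZE. A small standalone check of that formula (illustrative values, not part of the patch):

#include <cstddef>

// Illustrative only: linear offset of element (i, j) with i >= j in the packed
// upper triangular layout used for the kernel matrix above; row j stores the
// entries j, j + 1, ..., n - 1, where n already includes the padding.
constexpr std::size_t packed_upper_triangular_offset(const std::size_t i, const std::size_t j, const std::size_t n) {
    return j * n - j * (j + 1) / 2 + i;
}

// row 0 starts at offset 0, row 1 directly behind the n entries of row 0, and
// row 2 behind the n - 1 entries of row 1 (made-up n, any value works)
static_assert(packed_upper_triangular_offset(0, 0, 6) == 0);
static_assert(packed_upper_triangular_offset(1, 1, 6) == 6);
static_assert(packed_upper_triangular_offset(2, 2, 6) == 2 * 6 - 1);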
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -41,11 +48,11 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -62,85 +69,111 @@ class device_kernel_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz 
+ threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // create two local memory arrays used for caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - 
(dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } else { - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * (global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only } - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (partial) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group 
y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto device_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -152,8 +185,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -168,16 +201,21 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
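// A minimal host-side sketch of the packed upper-triangular indexing used for A above, assuming the
// row-major layout with `pad` padding entries per row implied by the loads (in the kernels, n corresponds
// to num_rows_ - device_row_offset_ and pad to PADDING_SIZE). The helper names are illustrative only.
#include <cstddef>

// Linear index of entry (row, col) with row <= col in a packed n x n upper triangular matrix;
// the `row * (row + 1) / 2` term skips the entries below the diagonal that are not stored.
std::size_t packed_upper_index(const std::size_t row, const std::size_t col, const std::size_t n, const std::size_t pad) {
    return row * (n + pad) + col - row * (row + 1) / 2;
}

// Symmetric access as in the SYMM kernels: swap the indices when below the diagonal, which
// corresponds to the `dim_block + threadIdx_x < global_j_idx_linear` branch above.
std::size_t packed_symmetric_index(const std::size_t row, const std::size_t col, const std::size_t n, const std::size_t pad) {
    return row <= col ? packed_upper_index(row, col, n, pad) : packed_upper_index(col, row, n, pad);
}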
* @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -186,12 +224,12 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -208,80 +246,105 @@ class device_kernel_symm_mirror { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &A_cache, auto &B_cache, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = 
blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // the indices used in the current work-item + ::sycl::require_local_mem(), // A_cache + ::sycl::require_local_mem(), // B_cache - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into shared memory + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &A_cache, auto &B_cache, auto &temp) { + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + 
device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + A_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } }); - // perform calculations + // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += A_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += A_cache[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } } } } }); } + // apply the (remaining) BLAS operation and update C ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + 
offsets if the global range is too large + + // calculate the indices to access the global data + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i(idx) + static_cast(internal_i); - const auto partial_global_j = j(idx) + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp(idx)[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -294,8 +357,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -312,6 +375,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
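// For reference, the operation implemented by device_kernel_symm and device_kernel_symm_mirror together
// is the BLAS-like update C = alpha * A * B + beta * C with a symmetric A (m == k). A minimal sequential
// sketch with a dense, unpadded, row-major A (k x k), B (k x n), and C (k x n); `double` stands in for
// plssvm::real_type and the function name is illustrative only.
#include <cstddef>
#include <vector>

void reference_symm(const std::size_t k, const std::size_t n, const double alpha,
                    const std::vector<double> &A, const std::vector<double> &B,
                    const double beta, std::vector<double> &C) {
    for (std::size_t row = 0; row < k; ++row) {
        for (std::size_t col = 0; col < n; ++col) {
            double sum{ 0.0 };
            for (std::size_t dim = 0; dim < k; ++dim) {
                sum += A[row * k + dim] * B[dim * n + col];  // A is symmetric, so A[row][dim] == A[dim][row]
            }
            C[row * n + col] = alpha * sum + beta * C[row * n + col];
        }
    }
}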
* @param[in] num_cols the number of columns in both matrices @@ -336,28 +402,29 @@ class device_kernel_inplace_matrix_add { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } }); @@ -380,6 +447,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
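// The two in-place kernels above perform simple element-wise updates on matrices stored row-major with
// PADDING_SIZE extra entries per row. A minimal sequential sketch assuming the same
// lhs[i * (num_cols + pad) + j] indexing; the function names and `double` are illustrative only.
#include <cstddef>

void inplace_matrix_add(double *lhs, const double *rhs, const std::size_t num_rows, const std::size_t num_cols, const std::size_t pad) {
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_cols; ++j) {
            lhs[i * (num_cols + pad) + j] += rhs[i * (num_cols + pad) + j];
        }
    }
}

void inplace_matrix_scale(double *lhs, const double scale, const std::size_t num_rows, const std::size_t num_cols, const std::size_t pad) {
    for (std::size_t i = 0; i < num_rows; ++i) {
        for (std::size_t j = 0; j < num_cols; ++j) {
            lhs[i * (num_cols + pad) + j] *= scale;
        }
    }
}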
* @param[in] num_cols the number of columns in the matrix @@ -404,28 +474,29 @@ class device_kernel_inplace_matrix_scale { void operator()(T group) const { ::sycl::memory_environment(group, [&]() { - // scale ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = idx.get_local_id(group, 0); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); - const std::size_t blockDim_x = group.get_logical_local_range(0); - const std::size_t blockDim_y = group.get_logical_local_range(1); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; - const std::size_t blockIdx_y = group[1] + grid_y_offset_; - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - // indices - const std::size_t i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - for (std::size_t internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE_uz; ++internal_i) { - for (std::size_t internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE_uz; ++internal_j) { - const std::size_t global_i = i + internal_i; - const std::size_t global_j = j + internal_j; + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } }); diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp index 33c725a46..b882cdead 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/scoped/kernel_matrix_assembly.hpp @@ -14,8 +14,10 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, 
apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,12 +30,16 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. * @param[out] kernel_matrix the calculated kernel matrix @@ -61,7 +67,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** @@ -72,14 +78,17 @@ class device_kernel_assembly { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), // data_i_cache - ::sycl::require_local_mem(), // data_j_cache + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // data_i_cache + ::sycl::require_local_mem(), // data_j_cache + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), // temp [&](auto &data_i_cache, auto &data_j_cache, auto &temp) { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { // cast values to 32-bit unsigned int values to prevent implicit conversions @@ -91,25 +100,25 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - 
const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -119,11 +128,26 @@ class device_kernel_assembly { const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned 
internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -136,36 +160,36 @@ class device_kernel_assembly { constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points 
and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp(idx)[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp index 965b043a3..5c0949c34 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/blas.hpp @@ -13,7 +13,9 @@ #define PLSSVM_BACKENDS_SYCL_CG_EXPLICIT_WORK_GROUP_BLAS_HPP_ #pragma once -#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type +#include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -24,16 +26,21 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm { public: + /// The used SYCL kernel invocation type. 
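// Sketch of how a single explicit kernel matrix entry is assembled by device_kernel_assembly: the reduced
// feature value is passed through the kernel function, shifted by QA_cost - q[i] - q[j], and the cost term
// is added on the diagonal. A Gaussian (RBF) kernel is assumed here purely as an example; the real code
// dispatches via detail::feature_reduce and detail::apply_kernel_function.
#include <cmath>
#include <cstddef>

double assemble_rbf_entry(const double *x_i, const double *x_j, const std::size_t num_features,
                          const double gamma, const double QA_cost, const double q_i, const double q_j,
                          const double cost, const bool on_diagonal) {
    double reduced{ 0.0 };
    for (std::size_t f = 0; f < num_features; ++f) {
        const double d = x_i[f] - x_j[f];
        reduced += d * d;  // feature reduction for the RBF kernel: squared Euclidean distance
    }
    double entry = std::exp(-gamma * reduced) + QA_cost - q_i - q_j;  // apply the kernel function and the q/QA_cost correction
    if (on_diagonal) {
        entry += cost;  // the cost is only added to the diagonal entries
    }
    return entry;
}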
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -42,13 +49,13 @@ class device_kernel_symm { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -67,64 +74,85 @@ class device_kernel_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = 
static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < (num_rows_ - row_offset_); dim += THREAD_BLOCK_SIZE_uz) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows + + // iterate over all values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < (num_rows_ - device_row_offset_); dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + // determine on which side of the diagonal we are located + if (dim_block + threadIdx_x < global_j_idx_linear) { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + global_j_idx_linear - (dim_block + threadIdx_x) * (dim_block + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } else { + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j_idx_linear * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) + dim_block + threadIdx_x - global_j_idx_linear * 
(global_j_idx_linear + std::size_t{ 1 }) / std::size_t{ 2 }]; // SoA, upper triangular matrix only + } - // determine on which side of the diagonal we are located - if (dim + threadIdx_x < global_j) { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + global_j - (dim + threadIdx_x) * (dim + threadIdx_x + std::size_t{ 1 }) / std::size_t{ 2 }]; - } else { - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[global_j * (num_rows_ - row_offset_ + PADDING_SIZE_uz) + dim + threadIdx_x - global_j * (global_j + std::size_t{ 1 }) / std::size_t{ 2 }]; + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim_block + device_row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA } - - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data - - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } + // calculate the indices used in the current thread + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + // apply the (partial) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < 
num_rhs_ && device_global_j < device_specific_num_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && device_global_j_idx < device_num_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -139,8 +167,8 @@ class device_kernel_symm { /// @cond Doxygen_suppress const std::size_t num_rows_; const std::size_t num_rhs_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -155,17 +183,22 @@ class device_kernel_symm { * @brief Perform an explicit BLAS SYMM operation: `C = alpha * A * B + beta * C` where @p A is a `m x k` symmetric matrix (memory optimized), @p B is a `k x n` matrix, @p C is a `m x n` matrix, and @p alpha and @p beta are scalars. * @details In a multi-GPU setting, this function is responsible for mirroring down the columns this device is responsible for! * Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_symm_mirror { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. 
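// The `if constexpr (target == target_platform::cpu)` branches above only change the loop nesting of the
// blocked dot product: on the CPU the `dim` loop is innermost and accumulates into a scalar, on GPUs it is
// outermost. Both orders produce the same `temp` values (up to floating-point rounding order). A minimal
// sketch using a per-work-item view of the caches; IB and TB are illustrative stand-ins for
// INTERNAL_BLOCK_SIZE and THREAD_BLOCK_SIZE.
constexpr unsigned IB = 4;
constexpr unsigned TB = 8;

void accumulate_dim_innermost(const double A[TB][IB], const double B[TB][IB], double temp[IB][IB]) {
    for (unsigned i = 0; i < IB; ++i) {
        for (unsigned j = 0; j < IB; ++j) {
            double sum{ 0.0 };  // running sum kept in a register
            for (unsigned dim = 0; dim < TB; ++dim) {
                sum += A[dim][j] * B[dim][i];
            }
            temp[i][j] += sum;
        }
    }
}

void accumulate_dim_outermost(const double A[TB][IB], const double B[TB][IB], double temp[IB][IB]) {
    for (unsigned dim = 0; dim < TB; ++dim) {
        for (unsigned i = 0; i < IB; ++i) {
            for (unsigned j = 0; j < IB; ++j) {
                temp[i][j] += A[dim][j] * B[dim][i];
            }
        }
    }
}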
* @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] num_rows the number of rows in @p A and @p C * @param[in] num_rhs the number of columns in @p B and @p C * @param[in] num_mirror_rows the number of rows to mirror down - * @param[in] device_specific_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices - * @param[in] row_offset the first row this device is responsible for + * @param[in] device_num_rows the number of rows in @p A and number of rows in @p B; thr rows in @p A are potentially distributed across multiple devices + * @param[in] device_row_offset the first row this device is responsible for * @param[in] alpha the scalar alpha value * @param[in] A the matrix @p A * @param[in] B the matrix @p B @@ -174,14 +207,14 @@ class device_kernel_symm_mirror { * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_specific_num_rows, const std::size_t row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + device_kernel_symm_mirror(::sycl::handler &cgh, const std::size_t num_rows, const std::size_t num_rhs, const std::size_t num_mirror_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const real_type alpha, const real_type *A, const real_type *B, const real_type beta, real_type *C, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : A_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, B_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, num_rows_{ num_rows }, num_rhs_{ num_rhs }, num_mirror_rows_{ num_mirror_rows }, - device_specific_num_rows_{ device_specific_num_rows }, - row_offset_{ row_offset }, + device_num_rows_{ device_num_rows }, + device_row_offset_{ device_row_offset }, alpha_{ alpha }, A_{ A }, B_{ B }, @@ -200,59 +233,79 @@ class device_kernel_symm_mirror { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto 
INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over the remaining features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < device_specific_num_rows_; dim += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - // store the values in the local memory - A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim + threadIdx_x) * (num_rows_ - row_offset_ + PADDING_SIZE_uz) - (dim + threadIdx_x - std::size_t{ 1 }) * (dim + threadIdx_x) / std::size_t{ 2 } + device_specific_num_rows_ - (dim + threadIdx_x) + global_j]; - B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(dim + row_offset_ + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i]; - } - nd_idx.barrier(); // wait until all threads loaded their part of the data - - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += A_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rhs + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_mirror_rows + + // iterate over the remaining values using blocking to be able to cache them for faster memory accesses + for (std::size_t dim_block = 0; dim_block < device_num_rows_; dim_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory 
+ for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + A_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = A_[(dim_block + threadIdx_x) * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - (dim_block + threadIdx_x - std::size_t{ 1 }) * (dim_block + threadIdx_x) / std::size_t{ 2 } + device_num_rows_ - (dim_block + threadIdx_x) + global_j_idx_linear]; // SoA, upper triangular matrix only + B_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = B_[(device_row_offset_ + dim_block + threadIdx_x) * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the dim is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + sum += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the dot product calculation, the dim is the slowest moving index + for (unsigned dim = 0; dim < THREAD_BLOCK_SIZE; ++dim) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += A_cache_[dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j] * B_cache_[dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all threads performed their part of the calculations } + // calculate the indices used in the current work-item + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rhs + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_mirror_rows + // apply the (remaining) BLAS operation and update C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto partial_global_j = j + static_cast(internal_j); - const auto global_j = row_offset_ + device_specific_num_rows_ + j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses - if (global_i < num_rhs_ && partial_global_j < num_mirror_rows_) { - C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j * (num_rhs_ + PADDING_SIZE_uz) + global_i]; + // calculate the indices to access the global data and the data with respect to the current device + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto partial_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_num_rows_ + partial_global_j_idx; + + // be 
sure to not perform out-of-bounds accesses + if (global_i_idx < num_rhs_ && partial_global_j_idx < num_mirror_rows_) { + C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx] = alpha_ * temp[internal_i][internal_j] + beta_ * C_[global_j_idx * (num_rhs_ + PADDING_SIZE_uz) + global_i_idx]; // SoA } } } @@ -268,8 +321,8 @@ class device_kernel_symm_mirror { const std::size_t num_rows_; const std::size_t num_rhs_; const std::size_t num_mirror_rows_; - const std::size_t device_specific_num_rows_; - const std::size_t row_offset_; + const std::size_t device_num_rows_; + const std::size_t device_row_offset_; const real_type alpha_; const real_type *A_; const real_type *B_; @@ -286,6 +339,9 @@ class device_kernel_symm_mirror { */ class device_kernel_inplace_matrix_add { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in both matrices @@ -307,25 +363,27 @@ class device_kernel_inplace_matrix_add { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for 
(unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] += rhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j]; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] += rhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx]; // SoA } } } @@ -346,6 +404,9 @@ class device_kernel_inplace_matrix_add { */ class device_kernel_inplace_matrix_scale { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] num_cols the number of columns in the matrix @@ -367,25 +428,27 @@ class device_kernel_inplace_matrix_scale { */ void operator()(::sycl::nd_item<2> nd_idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // # num_rows - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // # num_rhs + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_rhs for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = i + static_cast(internal_i); - const auto global_j = j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = i_idx + static_cast(internal_i); + const auto global_j_idx = j_idx + static_cast(internal_j); - lhs_[global_i * (num_cols_ + PADDING_SIZE_uz) + global_j] *= scale_; + lhs_[global_i_idx * (num_cols_ + PADDING_SIZE_uz) + global_j_idx] *= scale_; // SoA } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp index 560d556ea..ec9fc1773 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_explicit/work_group/kernel_matrix_assembly.hpp @@ -14,6 +14,7 @@ #pragma once #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform @@ -28,13 +29,16 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Create the explicit kernel matrix using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. - * @details target the target platform + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ template class device_kernel_assembly { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory @@ -65,7 +69,7 @@ class device_kernel_assembly { cost_{ cost }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) 
} { } /** @@ -82,12 +86,12 @@ class device_kernel_assembly { constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current work-item in work-group x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current work-item in work-group y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of work-items in work-group x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of work-items in work-group y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { @@ -95,75 +99,75 @@ class device_kernel_assembly { real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { - // calculate the indices used in the current work-item paying attention to coalesced memory accesses - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - // calculate the indices to access the global data points, pays attention to coalesced memory accesses - const auto global_i_linear = device_row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j_linear = device_row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * 
THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_linear]; - data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_linear]; + data_i_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - if constexpr (target == target_platform::gpu_amd) { - // perform the feature reduction calculation, the block_dim is the slowest moving index - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); - } - } - } - } else { - // perform the feature reduction calculation, the block_dim is the fastest moving index + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { real_type sum{ 0.0 }; - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - sum += detail::feature_reduce(data_i_cache_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_j_cache_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); } temp[internal_i][internal_j] += sum; } } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } + } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } } // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx 
= (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - // calculate the indices to access the global data points and wrt the current device - const auto device_global_i = i + static_cast(internal_i); - const auto global_i = device_row_offset_ + device_global_i; - const auto device_global_j = j + static_cast(internal_j); - const auto global_j = device_row_offset_ + device_global_j; - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - device_row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { real_type temp_ij = temp[internal_i][internal_j]; // apply the final kernel function - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp_ij += cost_; } // update the upper triangular kernel matrix - kernel_matrix_[device_global_j * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j * (device_global_j + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i] = temp_ij; + kernel_matrix_[device_global_j_idx * (num_rows_ - device_row_offset_ + PADDING_SIZE_uz) - device_global_j_idx * (device_global_j_idx + std::size_t{ 1 }) / std::size_t{ 2 } + device_global_i_idx] = temp_ij; } } } diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp index 9c82ad31d..c07186c37 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/basic/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::basic { /** * @brief Perform an implicit 
BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
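(The kernel-matrix store in the preceding hunk uses a packed, row-wise upper-triangular layout with a padded row length of num_rows_ - device_row_offset_ + PADDING_SIZE. A small helper that spells out that index arithmetic; the helper and its names are illustrative and not part of the patch:

    #include <cassert>
    #include <cstddef>

    // Packed index of entry (row j, column i), i >= j, in an upper-triangular
    // matrix whose full (padded) row length is n_padded. Row j holds the
    // entries i = j, ..., n_padded - 1, so its first entry sits behind
    // sum_{r < j} (n_padded - r) = j * n_padded - j * (j - 1) / 2 values.
    inline std::size_t packed_upper_index(const std::size_t j, const std::size_t i, const std::size_t n_padded) {
        assert(i >= j && i < n_padded);
        return j * n_padded - j * (j + 1) / 2 + i;
    }

    // e.g., with n_padded = 4 the packed order is
    // (0,0)(0,1)(0,2)(0,3)(1,1)(1,2)(1,3)(2,2)(2,3)(3,3)
    // and packed_upper_index(2, 3, 4) == 2*4 - 3 + 3 == 8.
)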
@@ -75,28 +81,53 @@ class device_kernel_assembly_symm { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t i = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t j = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto i_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows // only calculate the upper triangular matrix - if (i >= j) { + if (i_idx >= j_idx) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - - temp[internal_i][internal_j] += detail::feature_reduce(data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i], - data_d_[dim * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < 
INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); + + temp[internal_i][internal_j] += detail::feature_reduce(data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx], // SoA + data_[(feature_block + feature) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx]); // SoA + } + } } } } @@ -104,28 +135,48 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - real_type temp_ij = temp[internal_i][internal_j]; - temp_ij = detail::apply_kernel_function(temp_ij, kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { - temp_ij += cost_; - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + if (global_i_idx == global_j_idx) { + temp[internal_i][internal_j] += cost_; + } + } else { + // be sure to set the value to zero otherwise + temp[internal_i][internal_j] = real_type{ 0.0 }; + } + } + } + + //*************************************************************************// + // calculate C += alpha * temp * B // + //*************************************************************************// + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = 
device_row_offset_ + j_idx + static_cast(internal_j); + + if (global_i_idx == global_j_idx) { + // only apply once to the diagonal + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } else { - // calculate the values of alpha * A * B - for (std::size_t class_idx = 0; class_idx < num_classes_; ++class_idx) { - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + // apply it for the upper and lower triangular matrix + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; // symmetry - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + class_idx] } += alpha_ * temp_ij * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + class_idx]; + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += alpha_ * temp[internal_i][internal_j] * B_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx]; } } } @@ -137,11 +188,12 @@ class device_kernel_assembly_symm { private: /// @cond Doxygen_suppress const real_type alpha_; + const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp index 342e8308b..ea9197444 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/hierarchical/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's hierarchical data parallel kernels. 
+ * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,41 +73,45 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // calculate the indices used in the current work-item - ::sycl::private_memory i{ group }; - ::sycl::private_memory i_linear{ group }; - ::sycl::private_memory j{ group }; - ::sycl::private_memory j_linear{ group }; + // the indices used in the current work-item + ::sycl::private_memory i_idx{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx{ group }; // device_num_rows + + ::sycl::private_memory i_idx_linear{ group }; // num_rows - device_row_offset + ::sycl::private_memory j_idx_linear{ group }; // device_num_rows - // create the shared memory arrays used for caching data point features - real_type data_cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; // initialize private and local variables group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * 
INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { @@ -115,30 +125,36 @@ class device_kernel_assembly_symm { // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { - // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto data_cache_i = reinterpret_cast(data_cache_one); - auto data_cache_j = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto data_i_cache = reinterpret_cast(cache_one); + auto data_j_cache = reinterpret_cast(cache_two); // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_i[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_i_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + 
data_j_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } }); @@ -146,14 +162,30 @@ class device_kernel_assembly_symm { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -167,16 +199,18 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && 
global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -189,38 +223,44 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { - // reinterpret the arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto B_cache = reinterpret_cast(data_cache_one); - auto C_out_cache = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; + B_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[internal * THREAD_BLOCK_SIZE + local_id_1][local_id_0] = real_type{ 0.0 }; // SoA } }); // implicit group barrier - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast 
values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -235,18 +275,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_y = idx.get_local_id(1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE + internal][local_id_1]; // SoA } }); @@ -258,10 +302,11 @@ class device_kernel_assembly_symm { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } @@ -270,38 +315,44 @@ class device_kernel_assembly_symm { // implicit group barrier - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // reinterpret the arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto B_cache = reinterpret_cast(data_cache_one); - auto C_out_cache = reinterpret_cast(data_cache_two); + // reinterpret the local memory arrays to be of shape [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto B_cache = reinterpret_cast(cache_one); + auto C_out_cache = reinterpret_cast(cache_two); // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { 
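(The class-blocked loops in these kernels stage alpha * B in local memory, accumulate a block of partial results in C_out_cache, and only afterwards add the block to the global C matrix atomically, so each output element sees one atomic update per class block rather than one per multiply-add. A host-side sketch of that add-back step; the names are illustrative, the patch itself uses its detail::atomic_op wrapper on SYCL local memory, and std::atomic_ref requires C++20:

    #include <atomic>
    #include <cstddef>
    #include <vector>

    // Add a cached block of partial results into C with one atomic update per
    // (row, class) pair. 'partial' stands in for the kernel's C_out_cache.
    void add_block_to_C(std::vector<double> &C, const std::vector<double> &partial,
                        std::size_t row_begin, std::size_t class_begin,
                        std::size_t block_rows, std::size_t block_classes,
                        std::size_t num_classes) {
        for (std::size_t r = 0; r < block_rows; ++r) {
            for (std::size_t c = 0; c < block_classes; ++c) {
                std::atomic_ref<double> out{ C[(row_begin + r) * num_classes + class_begin + c] };
                out.fetch_add(partial[r * block_classes + c], std::memory_order_relaxed);
            }
        }
    }
)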
group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); // implicit group barrier - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); @@ -316,18 +367,22 @@ class device_kernel_assembly_symm { // implicit group barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += 
C_out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA
                             }
                         });
@@ -341,10 +396,10 @@ class device_kernel_assembly_symm {
     /// @cond Doxygen_suppress
     const real_type alpha_;
     const real_type *q_;
-    const real_type *data_d_;
+    const real_type *data_;
     const std::size_t num_rows_;
     const std::size_t device_num_rows_;
-    const std::size_t row_offset_;
+    const std::size_t device_row_offset_;
     const std::size_t num_features_;
     const real_type QA_cost_;
     const real_type cost_;
diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
index c84db480f..c833b19da 100644
--- a/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
+++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/scoped/kernel_matrix_assembly_blas.hpp
@@ -15,8 +15,10 @@
 #include "plssvm/backends/SYCL/detail/atomics.hpp"           // plssvm::sycl::detail::atomic_op
 #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp"  // plssvm::sycl::detail::{feature_reduce, apply_kernel_function}
+#include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/constants.hpp"                              // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE}
 #include "plssvm/kernel_function_types.hpp"                  // plssvm::kernel_function_type
+#include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
 
 #include "sycl/sycl.hpp"  // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item
 
@@ -28,20 +30,24 @@ namespace plssvm::sycl::detail::scoped {
 /**
  * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar.
  * @details Uses AdaptiveCpp's scoped parallelism.
+ * @tparam target the target platform
  * @tparam kernel_function the type of the used kernel function
  * @tparam Args the types of the parameters necessary for the specific kernel function
  */
-template <kernel_function_type kernel_function, typename... Args>
+template <target_platform target, kernel_function_type kernel_function, typename... Args>
 class device_kernel_assembly_symm {
   public:
+    /// The used SYCL kernel invocation type.
+    constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped;
+
     /**
      * @brief Initialize the SYCL kernel function object.
* @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -52,13 +58,13 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + device_kernel_assembly_symm(const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -67,7 +73,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
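// The constructor above copies the variadic kernel-function parameters into a std::tuple member;
// they are only expanded again when the kernel function is finally applied to the accumulated
// value. A small self-contained sketch of that capture-and-apply pattern, assuming a hypothetical
// RBF-style kernel with a single gamma parameter (apply_rbf() is not the plssvm API):
#include <cmath>
#include <cstdio>
#include <tuple>

template <typename... Args>
class kernel_parameter_holder {
  public:
    explicit kernel_parameter_holder(Args... args) :
        parameter_{ std::make_tuple(args...) } { }  // stored by value, no std::forward needed for trivially copyable parameters

    // expand the stored tuple back into an argument list via std::apply
    double apply_rbf(const double squared_distance) const {
        return std::apply([&](const double gamma) { return std::exp(-gamma * squared_distance); }, parameter_);
    }

  private:
    const std::tuple<Args...> parameter_;
};

int main() {
    const kernel_parameter_holder<double> holder{ 0.5 };  // gamma = 0.5
    std::printf("%f\n", holder.apply_rbf(2.0));            // exp(-0.5 * 2.0) = exp(-1)
    return 0;
}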
@@ -77,86 +83,124 @@ class device_kernel_assembly_symm { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // the indices used in the current work-item + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + ::sycl::require_private_mem(), // num_rows - device_row_offset + ::sycl::require_private_mem(), // device_num_rows + + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_i, auto &data_cache_j, auto &i, auto &i_linear, auto &j, auto &j_linear, auto &temp) { + [&](auto &i_idx, auto &j_idx, auto &i_idx_linear, auto &j_idx_linear, auto &cache_one, auto &cache_two, auto &temp) { // initialize private and local variables ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - i(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - i_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - j(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - j_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices to access the global data + i_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; + j_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; + // calculate the indices to access the global data, 
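// The index setup above distinguishes between a "blocked" index (every work-item owns
// INTERNAL_BLOCK_SIZE consecutive rows) and a "linear" index (the rows of a work-group are
// interleaved so that neighboring work-items load neighboring rows, i.e. coalesced accesses).
// A minimal sketch of both mappings with illustrative names and stand-in block sizes:
#include <cstddef>

constexpr unsigned INTERNAL_BLOCK = 4;  // stand-in for INTERNAL_BLOCK_SIZE

// first row handled by a work-item when each work-item owns a contiguous block of rows
constexpr std::size_t blocked_index(const std::size_t group_id, const std::size_t group_size, const std::size_t local_id) {
    return (group_id * group_size + local_id) * INTERNAL_BLOCK;
}

// row handled by a work-item in iteration `internal` when the rows are interleaved across the work-group
constexpr std::size_t linear_index(const std::size_t group_id, const std::size_t group_size, const std::size_t local_id, const unsigned internal) {
    return group_id * group_size * INTERNAL_BLOCK + internal * group_size + local_id;
}

static_assert(blocked_index(1, 8, 3) == 44);    // (1 * 8 + 3) * 4
static_assert(linear_index(1, 8, 3, 2) == 51);  // 1 * 8 * 4 + 2 * 8 + 3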
pays attention to coalesced memory accesses + i_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + j_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; }); - // exploit symmetry + // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (group[1] >= group[0]) { - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { - // load data into local memory - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// + { + // rename cached arrays + auto &data_i_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + // load data into local memory + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the local memory - data_cache_i[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; - } - }); - - // perform the feature reduction calculation - 
::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { - for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_cache_i[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + // store the values in the local memory + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA + } + }); + + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp(idx)[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp(idx)[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + } } } - } - }); + }); + } } // apply the remaining part of the kernel function and store the value in the output kernel matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto device_global_i = i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); - const auto device_global_j = j(idx) + 
static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx(idx) + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx(idx) + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp(idx)[internal_i][internal_j] = detail::apply_kernel_function(temp(idx)[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp(idx)[internal_i][internal_j] += cost_; } } else { @@ -167,36 +211,42 @@ class device_kernel_assembly_symm { } }); - // calculate C += alpha * temp * B for the UPPER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &B_cache = cache_one; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global 
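// A scalar sketch of how one entry of the dimensionally reduced kernel matrix is assembled in the
// branch above: the raw kernel value (the result of detail::apply_kernel_function on the reduced
// features) is shifted by QA_cost and the two q-vector entries, and the cost factor is only added
// on the main diagonal. The function name is illustrative.
#include <cstddef>

inline double kernel_matrix_entry(const double kernel_value, const double QA_cost, const double cost,
                                  const double q_i, const double q_j,
                                  const std::size_t global_i, const std::size_t global_j) {
    double entry = kernel_value + QA_cost - q_i - q_j;  // dimensional reduction
    if (global_i == global_j) {
        entry += cost;  // the cost factor only scales the main diagonal
    }
    return entry;
}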
data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } }); - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -209,18 +259,22 @@ class device_kernel_assembly_symm { }); } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_y = idx.get_local_id(group, 1); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j(idx) + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); } @@ -230,48 +284,53 @@ class device_kernel_assembly_symm { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal_i); - const auto global_j = row_offset_ + j(idx) + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx(idx) + static_cast(internal_j); - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { 
temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; } } } }); - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { - // allocate shared memory - auto &B_cache = data_cache_i; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // rename local memory + auto &B_cache = cache_one; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } }); - // implicit group barrier - - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); @@ -282,26 +341,26 @@ class device_kernel_assembly_symm { } } }); - - // implicit group 
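// The two passes above exploit the symmetry of the kernel matrix: each work-group only assembles
// one triangle of A, uses it once as-is ("UPPER" pass) and once transposed ("LOWER" pass), and
// zeroes the diagonal of its cached tile in between so the main diagonal is not applied twice.
// A plain host-side reference of that idea on dense, unpadded row-major arrays (sizes and names
// are illustrative, not the plssvm data layout):
#include <cstddef>
#include <vector>

void symm_like_update(const std::vector<double> &A_tri,  // n x n, only entries with i <= j are valid
                      const std::vector<double> &B,      // n x k
                      std::vector<double> &C,            // n x k, updated in place
                      const double alpha, const std::size_t n, const std::size_t k) {
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = i; j < n; ++j) {  // one triangle, including the diagonal
            const double a = alpha * A_tri[i * n + j];
            for (std::size_t c = 0; c < k; ++c) {
                C[i * k + c] += a * B[j * k + c];      // first pass: use the entry as-is
                if (i != j) {                          // diagonal contributes only once
                    C[j * k + c] += a * B[i * k + c];  // second pass: use the entry transposed
                }
            }
        }
    }
}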
barrier } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i(idx) + static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx(idx) + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } }); - - // implicit group barrier } } } @@ -312,10 +371,10 @@ class device_kernel_assembly_symm { /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp index 2e6ea3f4f..509e6cb25 100644 --- a/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp +++ b/include/plssvm/backends/SYCL/kernel/cg_implicit/work_group/kernel_matrix_assembly_blas.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,21 +30,25 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Perform an implicit BLAS SYMM-like operation: `C = alpha * A * B + C` where `A` is the implicitly calculated kernel matrix using the @p kernel_function (never actually stored, reducing the amount of needed global memory), @p B and @p C are matrices, and @p alpha is a scalar. * @details Uses SYCL's work-group data parallel kernels. 
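// In contrast to the scoped-parallelism kernels above, this header uses plain SYCL work-group data
// parallel (nd_range) kernels: local memory is requested through sycl::local_accessor objects in the
// command group, and explicit barriers synchronize the work-items of a work-group. A minimal,
// self-contained sketch of that kernel style (the per-work-group sum is only a toy example, not the
// plssvm kernel):
#include <cstddef>
#include <vector>

#include <sycl/sycl.hpp>

int main() {
    constexpr std::size_t local_size = 64;
    constexpr std::size_t global_size = 1024;

    sycl::queue q{};
    std::vector<float> input(global_size, 1.0f);
    std::vector<float> partial(global_size / local_size, 0.0f);
    {
        sycl::buffer<float, 1> in_buf{ input.data(), sycl::range<1>{ global_size } };
        sycl::buffer<float, 1> out_buf{ partial.data(), sycl::range<1>{ partial.size() } };

        q.submit([&](sycl::handler &cgh) {
            sycl::accessor in{ in_buf, cgh, sycl::read_only };
            sycl::accessor out{ out_buf, cgh, sycl::write_only };
            sycl::local_accessor<float, 1> cache{ sycl::range<1>{ local_size }, cgh };  // work-group local memory

            cgh.parallel_for(sycl::nd_range<1>{ sycl::range<1>{ global_size }, sycl::range<1>{ local_size } }, [=](sycl::nd_item<1> it) {
                const std::size_t lid = it.get_local_id(0);
                cache[lid] = in[it.get_global_id(0)];  // cooperatively load into local memory
                sycl::group_barrier(it.get_group());   // wait until all work-items loaded their value

                if (lid == 0) {  // naive per-work-group reduction
                    float sum = 0.0f;
                    for (std::size_t i = 0; i < local_size; ++i) {
                        sum += cache[i];
                    }
                    out[it.get_group(0)] = sum;
                }
            });
        });
    }  // the buffers synchronize their content back to the host vectors here
}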
+ * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function */ -template +template class device_kernel_assembly_symm { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory * @param[in] alpha the scalar alpha value * @param[in] q the vector used in the dimensional reduction - * @param[in] data_d the data points to calculate the implicit kernel matrix from + * @param[in] data the data points to calculate the implicit kernel matrix from * @param[in] num_rows the total number of data points (= total number of rows) * @param[in] device_num_rows the number of rows the current device is responsible for - * @param[in] row_offset the first row in @p data_d the current device is responsible for + * @param[in] device_row_offset the first row in @p data the current device is responsible for * @param[in] num_features the number of features per data point * @param[in] QA_cost the scalar used in the dimensional reduction * @param[in] cost the cost factor the diagonal is scaled with @@ -53,15 +59,15 @@ class device_kernel_assembly_symm { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data_d, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_i_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - data_cache_j_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + device_kernel_assembly_symm(::sycl::handler &cgh, const real_type alpha, const real_type *q, const real_type *data, const std::size_t num_rows, const std::size_t device_num_rows, const std::size_t device_row_offset, const std::size_t num_features, const real_type QA_cost, const real_type cost, const real_type *B, real_type *C, const std::size_t num_classes, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... 
kernel_function_parameter) : + cache_one_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + cache_two_{ ::sycl::range<1>{ static_cast(THREAD_BLOCK_SIZE) * static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] alpha_{ alpha }, q_{ q }, - data_d_{ data_d }, + data_{ data }, num_rows_{ num_rows }, device_num_rows_{ device_num_rows }, - row_offset_{ row_offset }, + device_row_offset_{ device_row_offset }, num_features_{ num_features }, QA_cost_{ QA_cost }, cost_{ cost }, @@ -70,7 +76,7 @@ class device_kernel_assembly_symm { num_classes_{ num_classes }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. @@ -82,47 +88,72 @@ class device_kernel_assembly_symm { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // calculate the indices used in the current work-item - const auto i = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto i_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto j = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto j_linear = 
blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto i_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_rows - device_row_offset + const auto j_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // device_num_rows + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto i_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_rows - device_row_offset + const auto j_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // device_num_rows // only calculate the upper triangular matrix -> can't use get_local_id() since all work-items in a work-group must progress further if (blockIdx_y >= blockIdx_x) { // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; + //*************************************************************************// + // inplace kernel matrix construction // + //*************************************************************************// { + // rename cached arrays + auto &data_i_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &data_j_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_i_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i]; - data_cache_j_[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_d_[(dim + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j]; + data_i_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_i_idx_linear]; // SoA + data_j_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = data_[(feature_block + threadIdx_x) * (num_rows_ + std::size_t{ 1 } + PADDING_SIZE_uz) + global_j_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index for (unsigned internal_i = 0; 
internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - temp[internal_i][internal_j] += detail::feature_reduce(data_cache_i_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], - data_cache_j_[block_dim * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } + temp[internal_i][internal_j] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { + for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { + temp[internal_i][internal_j] += detail::feature_reduce(data_i_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_1 * INTERNAL_BLOCK_SIZE + internal_i], + data_j_cache[feature * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + local_id_0 * INTERNAL_BLOCK_SIZE + internal_j]); + } } } } @@ -133,16 +164,18 @@ class device_kernel_assembly_symm { // apply the remaining part of the kernel function and store the value in the output kernel matrix for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto device_global_i = i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); - const auto device_global_j = j + static_cast(internal_j); - - // be sure to not perform out of bounds accesses for the kernel matrix (only using the upper triangular matrix) - if (device_global_i < (num_rows_ - row_offset_) && device_global_j < device_num_rows_ && global_i >= global_j) { - temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i] - q_[global_j]; + // calculate the indices to access the global data and the data with respect to the current device + const auto device_global_i_idx = i_idx + static_cast(internal_i); + const auto global_i_idx = device_row_offset_ + device_global_i_idx; + const auto device_global_j_idx = j_idx + static_cast(internal_j); + const auto global_j_idx = device_row_offset_ + device_global_j_idx; + + // be sure to not perform out-of-bounds accesses (only using the upper triangular matrix) + if (device_global_i_idx < (num_rows_ - device_row_offset_) && device_global_j_idx < device_num_rows_ && global_i_idx >= global_j_idx) { + // apply the final kernel function + temp[internal_i][internal_j] = detail::apply_kernel_function(temp[internal_i][internal_j], kernel_function_parameter_) + QA_cost_ - q_[global_i_idx] - q_[global_j_idx]; // apply the cost on the diagonal - if (global_i == global_j) { + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] += cost_; } } else { @@ -152,25 +185,28 @@ class device_kernel_assembly_symm { } } - // calculate C += alpha * temp * B for the UPPER triangular matrix + 
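// A condensed sketch of the compile-time loop-order specialization introduced above, using plain
// arrays and a stand-in target_platform_sketch enum instead of the plssvm types: on the CPU the
// feature loop is the innermost (fastest moving) index, so each temp entry becomes a contiguous,
// auto-vectorizable dot product; on GPUs the feature loop stays outermost so every loaded feature
// value is reused for the whole INTERNAL_BLOCK_SIZE x INTERNAL_BLOCK_SIZE register block.
// Both orderings compute the same result.
enum class target_platform_sketch { cpu, gpu };

constexpr unsigned TB = 8;  // stand-in for THREAD_BLOCK_SIZE
constexpr unsigned IB = 4;  // stand-in for INTERNAL_BLOCK_SIZE

template <target_platform_sketch target>
void feature_reduce_tile(const double (&cache_i)[TB][IB], const double (&cache_j)[TB][IB], double (&temp)[IB][IB]) {
    if constexpr (target == target_platform_sketch::cpu) {
        // feature is the fastest moving index
        for (unsigned ii = 0; ii < IB; ++ii) {
            for (unsigned jj = 0; jj < IB; ++jj) {
                double sum = 0.0;
                for (unsigned feature = 0; feature < TB; ++feature) {
                    sum += cache_i[feature][ii] * cache_j[feature][jj];
                }
                temp[ii][jj] += sum;
            }
        }
    } else {
        // feature is the slowest moving index
        for (unsigned feature = 0; feature < TB; ++feature) {
            for (unsigned ii = 0; ii < IB; ++ii) {
                for (unsigned jj = 0; jj < IB; ++jj) {
                    temp[ii][jj] += cache_i[feature][ii] * cache_j[feature][jj];
                }
            }
        }
    }
}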
//*************************************************************************// + // calculate C += alpha * temp * B for the UPPER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_i = row_offset_ + i_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_i_idx_linear = device_row_offset_ + i_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; - C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; + B_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = alpha_ * B_[global_i_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA + C_out_cache[(internal * THREAD_BLOCK_SIZE + local_id_1) * THREAD_BLOCK_SIZE + local_id_0] = real_type{ 0.0 }; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -181,10 +217,12 @@ class device_kernel_assembly_symm { nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j + static_cast(internal); - detail::atomic_op{ C_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal); + + detail::atomic_op{ C_[global_j_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_y] } += C_out_cache[(local_id_0 * INTERNAL_BLOCK_SIZE + internal) * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wai until all work-items updated C with their values } @@ -193,34 +231,39 @@ class device_kernel_assembly_symm { // set potential diagonal entries in temp to 0.0 such that we 
don't apply the main diagonal twice to C for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { - const auto global_i = row_offset_ + i + static_cast(internal_i); - const auto global_j = row_offset_ + j + static_cast(internal_j); + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal_i); + const auto global_j_idx = device_row_offset_ + j_idx + static_cast(internal_j); - if (global_i == global_j) { + // update the diagonal + if (global_i_idx == global_j_idx) { temp[internal_i][internal_j] = real_type{ 0.0 }; } } } - // calculate C += alpha * temp * B for the LOWER triangular matrix + //*************************************************************************// + // calculate C += alpha * temp * B for the LOWER triangular matrix // + //*************************************************************************// { // rename cached arrays - auto &B_cache = data_cache_i_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] - auto &C_out_cache = data_cache_j_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &B_cache = cache_one_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] + auto &C_out_cache = cache_two_; // [THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE] // iterate over all classes using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_j = row_offset_ + j_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_j_idx_linear = device_row_offset_ + j_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the in the shared memory - B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x]; + // store the values in the local memory + B_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_ * B_[global_j_idx_linear * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x]; // SoA C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // calculate intermediate results and store them in shared memory + // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -231,10 +274,12 @@ class device_kernel_assembly_symm { nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to C + // atomically add the intermediate cached results to the C matrix for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_i = row_offset_ + i + 
static_cast(internal); - detail::atomic_op{ C_[global_i * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; + // calculate the indices to access the global data + const auto global_i_idx = device_row_offset_ + i_idx + static_cast(internal); + + detail::atomic_op{ C_[global_i_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += C_out_cache[local_id_0 * INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE + internal * THREAD_BLOCK_SIZE + local_id_1]; // SoA } nd_idx.barrier(); // wait until all threads updated C with their values } @@ -244,17 +289,17 @@ class device_kernel_assembly_symm { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_i_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_j_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress const real_type alpha_; const real_type *q_; - const real_type *data_d_; + const real_type *data_; const std::size_t num_rows_; const std::size_t device_num_rows_; - const std::size_t row_offset_; + const std::size_t device_row_offset_; const std::size_t num_features_; const real_type QA_cost_; const real_type cost_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp index 631bf80a1..07d1a79dc 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/basic/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::basic { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. 
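// A plain host-side reference for what device_kernel_w_linear computes: for the linear kernel the
// per-class weight vector w can be precomputed once as
//     w[feature][class] = sum over all support vectors sv of alpha[class][sv] * support_vectors[feature][sv],
// which turns every later prediction into a single dot product. Padding and the per-device
// support-vector partitioning are omitted for clarity; names and layouts are illustrative.
#include <cstddef>
#include <vector>

std::vector<double> compute_w(const std::vector<double> &alpha,            // num_classes x num_sv, row-major
                              const std::vector<double> &support_vectors,  // num_features x num_sv, row-major
                              const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_features) {
    std::vector<double> w(num_features * num_classes, 0.0);  // num_features x num_classes, row-major
    for (std::size_t feature = 0; feature < num_features; ++feature) {
        for (std::size_t clazz = 0; clazz < num_classes; ++clazz) {
            double sum = 0.0;
            for (std::size_t sv = 0; sv < num_sv; ++sv) {
                sum += alpha[clazz * num_sv + sv] * support_vectors[feature * num_sv + sv];
            }
            w[feature * num_classes + clazz] = sum;
        }
    }
    return w;
}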
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -60,77 +67,106 @@ class device_kernel_w_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto feature_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; ++sv) { - // perform the dot product calculation + // iterate over all support vectors using blocking + for (std::size_t sv_block = 0; sv_block < 
device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + real_type sum{ 0.0 }; + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + sum += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (std::size_t sv = 0; sv < THREAD_BLOCK_SIZE_uz; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_class_idx = class_idx + static_cast(internal_class); + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + + temp[internal_feature][internal_class] += alpha_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv_block + sv + device_sv_offset_] * // AoS + support_vectors_[global_feature_idx * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + sv]; // SoA + } + } + } + } + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - temp[internal_feature][internal_class] += alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_] * sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } - - // update global array with local one - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_feature_idx = feature_idx + static_cast(internal_feature); - - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; - } - } } private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t 
grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -143,46 +179,70 @@ class device_kernel_predict_linear { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_classes // create a work-item private array used for internal caching real_type 
temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the dot product calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_class_idx = class_idx + static_cast(internal_class); - - temp[internal_pd][internal_class] += w_d_[dim * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); + + temp[internal_pp][internal_class] += w_[(feature_block + feature) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] * // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; // SoA + } + } } } } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// @cond Doxygen_suppress - real_type 
*prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -192,21 +252,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's basic data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::basic; + /** * @brief Initialize the SYCL kernel function object. - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -215,19 +279,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
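[Editor's note] The `if constexpr (target == target_platform::cpu)` branches introduced in these kernels only change the order of the blocked loops: on the CPU the blocked `sv`/`feature` index becomes the innermost reduction loop (contiguous accesses within a work-item), whereas on GPUs it stays the outermost loop of the block, presumably so that the compiler can vectorize the CPU path while neighboring work-items keep favorable access patterns on the GPU. A minimal standalone sketch of this loop-reordering pattern is shown below; `target_platform`, `dot_block`, and `BLOCK` are simplified stand-ins chosen for illustration, not PLSSVM symbols. Both branches compute identical results; only the memory-access pattern differs.

```cpp
#include <cstddef>

enum class target_platform { cpu, gpu };  // reduced stand-in for plssvm::target_platform

constexpr unsigned BLOCK = 8;  // assumed block size, analogous to THREAD_BLOCK_SIZE

// Accumulate a BLOCK-wide partial dot product into acc[i][j].
// Only the loop nest order differs between the two branches.
template <target_platform target>
void dot_block(double (&acc)[2][2], const double *a, const double *b) {
    if constexpr (target == target_platform::cpu) {
        // blocked index innermost: contiguous loads, easy for the compiler to vectorize
        for (unsigned i = 0; i < 2; ++i) {
            for (unsigned j = 0; j < 2; ++j) {
                double sum = 0.0;
                for (unsigned k = 0; k < BLOCK; ++k) {
                    sum += a[i * BLOCK + k] * b[j * BLOCK + k];
                }
                acc[i][j] += sum;
            }
        }
    } else {
        // blocked index outermost, as in the GPU branch of the kernels in this patch
        for (unsigned k = 0; k < BLOCK; ++k) {
            for (unsigned i = 0; i < 2; ++i) {
                for (unsigned j = 0; j < 2; ++j) {
                    acc[i][j] += a[i * BLOCK + k] * b[j * BLOCK + k];
                }
            }
        }
    }
}
```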
@@ -235,54 +299,83 @@ class device_kernel_predict { */ void operator()(::sycl::item<2> idx) const { // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); // calculate the indices used in the current work-item - const std::size_t pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; - const std::size_t sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE_uz; + const auto pp_idx = (idx.get_id(1) + grid_x_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto sv_idx = (idx.get_id(0) + grid_y_offset_ * THREAD_BLOCK_SIZE_uz) * INTERNAL_BLOCK_SIZE_uz; // num_support_vectors // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; ++dim) { - // perform the feature reduction calculation - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - const auto global_sv_idx = sv_idx + static_cast(internal_sv); - - temp[internal_pd][internal_sv] += detail::feature_reduce(sv_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], - predict_points_d_[dim * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); + // iterate over all features using blocking + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + real_type sum{ 0.0 }; + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + sum += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (std::size_t feature = 0; feature < THREAD_BLOCK_SIZE_uz; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_sv_idx = sv_idx + static_cast(internal_sv); + + temp[internal_pp][internal_sv] += detail::feature_reduce(support_vectors_[(feature_block + feature) * (num_sv_ 
+ PADDING_SIZE_uz) + global_sv_idx], // SoA + predict_points_[(feature_block + feature) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]); // SoA + } + } } } } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; ++dim) { + // iterate over all classes using blocking + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { if (sv_idx == 0) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += -rho_d_[dim]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + // calculate the index to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += -rho_[class_block + class_idx]; + } } } - // calculate intermediate results and store them in local memory - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // atomically add the results to the prediction + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - const auto global_pp_idx = pp_idx + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_sv_idx = sv_idx + static_cast(internal_sv); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim] } += - temp[internal_pd][internal_sv] * alpha_d_[dim * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (std::size_t class_idx = 0; class_idx < THREAD_BLOCK_SIZE_uz; ++class_idx) { + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + class_idx] } += + temp[internal_pp][internal_sv] * alpha_[(class_block + class_idx) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + } } } } @@ -290,11 +383,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp index 
dedfe609e..1bb93cc3c 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/hierarchical/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::group, sycl::private_memory, sycl::h_item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::hierarchical { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -59,36 +66,15 @@ class device_kernel_w_linear { * @param[in] group indices representing the 
current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_feature[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_alpha[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory feature_idx{ group }; - ::sycl::private_memory feature_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type feature_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type alpha_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private temp matrix to zero group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { temp(idx)[internal_i][internal_j] = real_type{ 0.0 }; @@ -99,23 +85,36 @@ class device_kernel_w_linear { // implicit group barrier // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = 
static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); @@ -123,13 +122,28 @@ class device_kernel_w_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * 
INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } @@ -138,16 +152,30 @@ class device_kernel_w_linear { // implicit group barrier } - // update global array with local one + // update the global w-vector with the locally cached values group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -155,41 +183,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// 
@endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -201,35 +234,15 @@ class device_kernel_predict_linear { * @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_w[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory class_idx{ group }; - ::sycl::private_memory class_idx_linear{ group }; + // create two local memory arrays used for caching + real_type pp_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type w_cache[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(0); // current thread in block 
x-dimension - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -240,25 +253,38 @@ class device_kernel_predict_linear { // implicit group barrier - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // load data 
into shared memory + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); @@ -266,13 +292,28 @@ class device_kernel_predict_linear { // perform the dot product calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } @@ -281,16 +322,30 @@ class device_kernel_predict_linear { // implicit group barrier } - // update global array with local one + // update the global array with the local one group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < 
INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -298,10 +353,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -311,21 +366,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's hierarchical data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::hierarchical; + /** * @brief Initialize the SYCL kernel function object. 
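[Editor's note] The hierarchical kernels in this patch all follow the same SYCL hierarchical-parallelism structure: arrays declared at work-group scope (the `*_cache` buffers) live in local memory and are shared by the whole work-group, and consecutive `group.parallel_for_work_item(...)` calls are separated by implicit group barriers. The following is a minimal, self-contained sketch of that load-then-consume structure, assuming a SYCL compiler that still provides the (deprecated in SYCL 2020) hierarchical API; buffer names and sizes are purely illustrative and not taken from PLSSVM.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t N = 16;  // number of work-groups (illustrative)
    constexpr std::size_t WG = 8;  // work-group size (illustrative)
    std::vector<float> in(N * WG, 1.0f), out(N * WG, 0.0f);

    sycl::queue q;
    {
        sycl::buffer<float, 1> in_buf{ in.data(), sycl::range<1>{ N * WG } };
        sycl::buffer<float, 1> out_buf{ out.data(), sycl::range<1>{ N * WG } };

        q.submit([&](sycl::handler &cgh) {
            sycl::accessor in_acc{ in_buf, cgh, sycl::read_only };
            sycl::accessor out_acc{ out_buf, cgh, sycl::write_only };

            cgh.parallel_for_work_group(sycl::range<1>{ N }, sycl::range<1>{ WG }, [=](sycl::group<1> group) {
                // work-group scope -> allocated in local memory, shared by the work-group
                float cache[WG];

                // first phase: cooperative load into the local cache
                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    cache[item.get_local_id(0)] = in_acc[item.get_global_id(0)];
                });
                // implicit group barrier between the two parallel_for_work_item calls

                // second phase: consume the cached values
                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    out_acc[item.get_global_id(0)] = 2.0f * cache[item.get_local_id(0)];
                });
            });
        });
    }
    return 0;
}
```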
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -334,51 +393,34 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
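[Editor's note] The per-work-item `temp` accumulators in these hierarchical kernels are `::sycl::private_memory` objects because a plain variable declared inside a `parallel_for_work_item` lambda would not survive into the next `parallel_for_work_item` scope; `private_memory` keeps one value per work-item alive across the whole work-group function. A minimal runnable sketch of that mechanism follows, again assuming the deprecated-but-available hierarchical API; all names and sizes are illustrative, not PLSSVM code.

```cpp
#include <sycl/sycl.hpp>

#include <cstddef>
#include <vector>

int main() {
    constexpr std::size_t N = 4;   // work-groups (illustrative)
    constexpr std::size_t WG = 8;  // work-group size (illustrative)
    std::vector<int> out(N * WG, 0);

    sycl::queue q;
    {
        sycl::buffer<int, 1> out_buf{ out.data(), sycl::range<1>{ N * WG } };
        q.submit([&](sycl::handler &cgh) {
            sycl::accessor out_acc{ out_buf, cgh, sycl::write_only };
            cgh.parallel_for_work_group(sycl::range<1>{ N }, sycl::range<1>{ WG }, [=](sycl::group<1> group) {
                // one private value per work-item, alive for the whole work-group scope
                sycl::private_memory<int, 1> counter{ group };

                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    counter(item) = static_cast<int>(item.get_local_id(0));  // initialize per work-item
                });
                // implicit barrier; counter(item) keeps its per-work-item value

                group.parallel_for_work_item([&](sycl::h_item<1> item) {
                    counter(item) += 100;                            // update the private value
                    out_acc[item.get_global_id(0)] = counter(item);  // publish the result
                });
            });
        });
    }
    return 0;
}
```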
* @param[in] group indices representing the current point in the execution space */ void operator()(::sycl::group<2> group) const { - // allocate shared memory - real_type data_cache_pp[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - real_type data_cache_sv[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; - - // calculate the indices used in the current work-item - ::sycl::private_memory pp_idx{ group }; - ::sycl::private_memory pp_idx_linear{ group }; - ::sycl::private_memory sv_idx_linear{ group }; + // create two local memory arrays used for caching + real_type cache_one[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + real_type cache_two[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE * THREAD_BLOCK_SIZE]; + // create a private memory array used for internal caching ::sycl::private_memory temp{ group }; - // initialize private and local variables + // initialize private variable group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - // initialize private temp matrix to zero for (unsigned internal_i = 0; internal_i < INTERNAL_BLOCK_SIZE; ++internal_i) { for (unsigned internal_j = 0; internal_j < INTERNAL_BLOCK_SIZE; ++internal_j) { @@ -390,25 +432,42 @@ class device_kernel_predict { // implicit group barrier { + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &pp_cache = cache_one; + // auto &sv_cache = cache_two; + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items 
in work-group x-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; } }); @@ -416,14 +475,30 @@ class device_kernel_predict { // perform the feature reduction calculation group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + 
internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(cache_two[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + cache_one[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -435,9 +510,9 @@ class device_kernel_predict { // update temp using the respective kernel function group.parallel_for_work_item([&](::sycl::h_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); @@ -445,33 +520,42 @@ class device_kernel_predict { // implicit group barrier { - // rename cached arrays -> can't rename the arrays due to AdaptiveCpp runtime exception - // auto &alpha_cache = data_cache_pp; - // auto &out_cache = data_cache_sv; + // rename cached arrays -> not possible due to an AdaptiveCpp runtime exception + // auto &alpha_cache = cache_one; + // auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < 
INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + cache_one[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; + cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } } }); @@ -481,13 +565,14 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - data_cache_sv[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * data_cache_pp[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + cache_two[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * cache_one[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); @@ -495,19 +580,29 @@ class device_kernel_predict { // implicit group barrier } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction group.parallel_for_work_item([&](::sycl::h_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(0)); const auto local_id_1 = static_cast(idx.get_local_id(1)); - const std::size_t threadIdx_x = idx.get_local_id(0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(0)); // current work-item in work-group x-dimension + const 
auto threadIdx_y = static_cast(idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += cache_two[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); @@ -518,11 +613,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp index e6d56ec56..a62418057 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/scoped/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::memory_environment, sycl::require_local_mem, sycl::require_private_mem, sycl::distribute_items_and_wait, sycl::s_item @@ -28,29 +30,34 @@ namespace plssvm::sycl::detail::scoped { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
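// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the dispatch pattern behind the
// new `target` template parameter and `invocation_type` constant added to the
// kernel classes in this patch. A non-type template parameter lets
// `if constexpr` pick a target-specific code path at compile time, and a
// constexpr tag lets host code inspect how a kernel expects to be launched.
// The enum values, class, and function names below are simplified stand-ins,
// not the plssvm API.
// ---------------------------------------------------------------------------
#include <iostream>

enum class kernel_invocation_type { work_group, hierarchical, scoped };
enum class target_platform { cpu, gpu };

template <target_platform target>
class demo_kernel {
  public:
    // compile-time tag describing how this kernel wants to be invoked
    constexpr static kernel_invocation_type invocation_type = kernel_invocation_type::scoped;

    void operator()() const {
        if constexpr (target == target_platform::cpu) {
            std::cout << "running the CPU-tuned loop order\n";
        } else {
            std::cout << "running the GPU-tuned loop order\n";
        }
    }
};

template <typename Kernel>
void launch(const Kernel &kernel) {
    // the launcher can branch on the tag without any runtime overhead
    if constexpr (Kernel::invocation_type == kernel_invocation_type::scoped) {
        std::cout << "dispatching a scoped-parallelism kernel\n";
    }
    kernel();
}

int main() {
    launch(demo_kernel<target_platform::cpu>{});
    launch(demo_kernel<target_platform::gpu>{});
    return 0;
}
// ---------------------------------------------------------------------------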
- * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -62,78 +69,101 @@ class device_kernel_w_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_feature, auto &data_cache_alpha, auto &feature_idx, auto &feature_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - feature_idx(idx) = (blockIdx_y * 
blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - feature_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // feature_cache + ::sycl::require_local_mem(), // alpha_cache + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &feature_cache, auto &alpha_cache, auto &temp) { // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - data_cache_feature[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * 
(device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS + feature_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_feature][internal_class] += data_cache_alpha[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp(idx)[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_feature][internal_class] += alpha_cache[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } } } } }); } - // update global array with local one + // update the global w-vector with the locally cached values ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current 
work-group in global range y-dimension + offsets if the global range is too large + + // calculate the indices used in the current thread + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_feature_idx = feature_idx(idx) + static_cast(internal_feature); + // calculate the indices to access the global data + const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_feature][internal_class]; // SoA } } }); @@ -142,41 +172,46 @@ class device_kernel_w_linear { private: /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
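// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the two loop orders used in the
// `if constexpr (target == target_platform::cpu)` branches of the dot product
// above. With the reduction index (`sv`) innermost, each output entry is
// accumulated in a single register before being written back (CPU friendly);
// with it outermost, every cached slice is reused across the whole temp tile
// before the next slice is touched (GPU/local-memory friendly). Both orders
// compute the same values. Constants and array names are simplified stand-ins.
// ---------------------------------------------------------------------------
#include <array>
#include <iostream>

int main() {
    constexpr unsigned TB = 4;  // stand-in for THREAD_BLOCK_SIZE
    constexpr unsigned IB = 2;  // stand-in for INTERNAL_BLOCK_SIZE

    // stand-ins for the work-group local caches filled above
    std::array<std::array<double, IB>, TB> alpha_cache{};
    std::array<std::array<double, IB>, TB> feature_cache{};
    for (unsigned sv = 0; sv < TB; ++sv) {
        for (unsigned i = 0; i < IB; ++i) {
            alpha_cache[sv][i] = 0.5 * sv + i;
            feature_cache[sv][i] = 1.0 + sv - 0.25 * i;
        }
    }

    // reduction index innermost: one register accumulator per output entry
    std::array<std::array<double, IB>, IB> temp_cpu{};
    for (unsigned feature = 0; feature < IB; ++feature) {
        for (unsigned cls = 0; cls < IB; ++cls) {
            double sum{ 0.0 };
            for (unsigned sv = 0; sv < TB; ++sv) {
                sum += alpha_cache[sv][cls] * feature_cache[sv][feature];
            }
            temp_cpu[feature][cls] += sum;
        }
    }

    // reduction index outermost: accumulate directly into the temp tile
    std::array<std::array<double, IB>, IB> temp_gpu{};
    for (unsigned sv = 0; sv < TB; ++sv) {
        for (unsigned feature = 0; feature < IB; ++feature) {
            for (unsigned cls = 0; cls < IB; ++cls) {
                temp_gpu[feature][cls] += alpha_cache[sv][cls] * feature_cache[sv][feature];
            }
        }
    }

    std::cout << std::boolalpha << (temp_cpu == temp_gpu) << '\n';  // true
}
// ---------------------------------------------------------------------------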
- * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -191,79 +226,102 @@ class device_kernel_predict_linear { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_w, auto &pp_idx, auto &pp_idx_linear, auto &class_idx, auto &class_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_x = idx.get_local_id(group, 0); // current thread in block x-dimension - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - class_idx(idx) = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - class_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + // create 
two local memory arrays used for caching + ::sycl::require_local_mem(), // pp_cache + ::sycl::require_local_mem(), // w_cache - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // create a private memory array used for internal caching + ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), + [&](auto &pp_cache, auto &w_cache, auto &temp) { + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into shared memory + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes + + // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; // store the values in the local memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + 
PADDING_SIZE_uz) + global_class_idx]; + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA } }); // perform the dot product calculation ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp(idx)[internal_pd][internal_class] += data_cache_w[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp(idx)[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp(idx)[internal_pp][internal_class] += w_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } } } } }); } - // update global array with local one + // update the global array with the local one ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large + + // 
calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx(idx) + static_cast(internal_class); - const auto global_pp_idx = pp_idx(idx) + static_cast(internal_pd); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); + const auto global_class_idx = class_idx + static_cast(internal_class); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp(idx)[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } }); @@ -272,10 +330,10 @@ class device_kernel_predict_linear { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -285,21 +343,25 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses AdaptiveCpp's scoped parallelism. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::scoped; + /** * @brief Initialize the SYCL kernel function object. 
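// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: why the global index arithmetic
// above always adds a padding term, e.g.
// `global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx`.
// Padding each row up to a block multiple lets a blocked kernel read a full
// block past the logical edge without bounds checks, assuming (as implied by
// the kernels) that the padded entries are zero and therefore do not change
// the result. Sizes and names are simplified stand-ins.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    constexpr std::size_t rows = 3;
    constexpr std::size_t cols = 5;
    constexpr std::size_t block = 4;                         // blocked access width
    constexpr std::size_t padding = block - (cols % block);  // pad cols up to a block multiple

    // row-major storage with padded rows; padded entries stay 0.0
    std::vector<double> data(rows * (cols + padding), 0.0);
    auto idx = [&](std::size_t row, std::size_t col) { return row * (cols + padding) + col; };
    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols; ++c) {
            data[idx(r, c)] = 1.0;
        }
    }

    // blocked sum: every access stays in-bounds thanks to the padding,
    // even though the last block extends past the logical number of columns
    double sum = 0.0;
    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols + padding; c += block) {
            for (std::size_t b = 0; b < block; ++b) {
                sum += data[idx(r, c + b)];
            }
        }
    }
    std::cout << sum << '\n';  // 15, the padded zeros contribute nothing
}
// ---------------------------------------------------------------------------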
- * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -308,19 +370,19 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
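// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: storing a kernel function's
// extra parameters in a std::tuple at construction (as the constructor above
// does with `std::make_tuple(kernel_function_parameter...)`) and forwarding
// them later with std::apply. The toy kernel functions and parameter values
// below are simplified stand-ins, not the plssvm ones.
// ---------------------------------------------------------------------------
#include <cmath>
#include <iostream>
#include <tuple>

// toy "kernel functions" taking different numbers of extra parameters
double rbf(double dist_sq, double gamma) { return std::exp(-gamma * dist_sq); }
double polynomial(double dot, int degree, double gamma, double coef0) { return std::pow(gamma * dot + coef0, degree); }

template <typename KernelFn, typename... Args>
class predictor {
  public:
    predictor(KernelFn fn, Args... kernel_function_parameter) :
        fn_{ fn },
        kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { }

    // apply the stored parameters after the data-dependent value
    double operator()(const double value) const {
        return std::apply([&](const Args &...params) { return fn_(value, params...); }, kernel_function_parameter_);
    }

  private:
    KernelFn fn_;
    std::tuple<Args...> kernel_function_parameter_;
};

int main() {
    const predictor rbf_predict{ &rbf, 0.5 };                  // gamma
    const predictor poly_predict{ &polynomial, 3, 1.0, 0.0 };  // degree, gamma, coef0
    std::cout << rbf_predict(2.0) << ' ' << poly_predict(2.0) << '\n';  // ~0.367879 and 8
}
// ---------------------------------------------------------------------------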
@@ -330,102 +392,130 @@ class device_kernel_predict { template void operator()(T group) const { ::sycl::memory_environment(group, - ::sycl::require_local_mem(), - ::sycl::require_local_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), - ::sycl::require_private_mem(), + // create two local memory arrays used for caching + ::sycl::require_local_mem(), // cache_one + ::sycl::require_local_mem(), // cache_two + + // create a private memory array used for internal caching ::sycl::require_private_mem, INTERNAL_BLOCK_SIZE>>({}), - [&](auto &data_cache_pp, auto &data_cache_sv, auto &pp_idx, auto &pp_idx_linear, auto &sv_idx_linear, auto &temp) { - // initialize private and local variables - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const std::size_t threadIdx_y = idx.get_local_id(group, 1); // current thread in block y-dimension - const std::size_t blockDim_x = group.get_logical_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = group.get_logical_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = group[1] + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - - // indices - pp_idx(idx) = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - pp_idx_linear(idx) = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - sv_idx_linear(idx) = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - }); + [&](auto &cache_one, auto &cache_two, auto &temp) { + { + // rename cached arrays + auto &pp_cache = cache_one; + auto &sv_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += static_cast(THREAD_BLOCK_SIZE)) { - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += static_cast(THREAD_BLOCK_SIZE)) { + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = 
static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - // store the values in the shared memory - data_cache_pp[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; - } - }); + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the feature reduction calculation - ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); - const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; + } + }); - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + // perform the feature reduction calculation + ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions + const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); + const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); + + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned 
internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp(idx)[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp(idx)[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + } } } - } - }); + }); + } } // update temp using the respective kernel function ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp(idx)[internal_pd][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pd][internal_sv], kernel_function_parameter_); + temp(idx)[internal_pp][internal_sv] = detail::apply_kernel_function(temp(idx)[internal_pp][internal_sv], kernel_function_parameter_); } } }); { // rename cached arrays - auto &alpha_cache = data_cache_pp; - auto &out_cache = data_cache_sv; + auto &alpha_cache = cache_one; + auto &out_cache = cache_two; - // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += static_cast(THREAD_BLOCK_SIZE)) { + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += static_cast(THREAD_BLOCK_SIZE)) { // load data into local memory ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t blockIdx_x = group[0] + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + const auto threadIdx_x = static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(group.get_logical_local_range(0)); // number of work-items in work-group x-dimension + 
const auto blockIdx_x = static_cast(group[0]) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear(idx) + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } @@ -435,33 +525,42 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp(idx)[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp(idx)[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } }); } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction ::sycl::distribute_items_and_wait(group, [&](::sycl::s_item<2> idx) { + // cast values to 32-bit unsigned int values to prevent implicit conversions const auto local_id_0 = static_cast(idx.get_local_id(group, 0)); const auto local_id_1 = static_cast(idx.get_local_id(group, 1)); - const std::size_t threadIdx_x = idx.get_local_id(group, 0); + // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + + const auto threadIdx_x = 
static_cast(idx.get_local_id(group, 0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(idx.get_local_id(group, 1)); // current work-item in work-group y-dimension + const auto blockDim_y = static_cast(group.get_logical_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_y = static_cast(group[1]) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + // calculate the indices used in the current thread + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx(idx) + static_cast(internal); + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x + THREAD_BLOCK_SIZE_uz] } += out_cache[local_id_0 + THREAD_BLOCK_SIZE][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; } }); } @@ -471,11 +570,11 @@ class device_kernel_predict { private: /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp index 6612a10d8..25bec3f13 100644 --- a/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp +++ b/include/plssvm/backends/SYCL/kernel/predict/work_group/predict_kernel.hpp @@ -15,8 +15,10 @@ #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op #include "plssvm/backends/SYCL/kernel/kernel_functions.hpp" // plssvm::sycl::detail::{feature_reduce, apply_kernel_function} +#include "plssvm/backends/SYCL/kernel_invocation_types.hpp" // plssvm::sycl::kernel_invocation_type #include "plssvm/constants.hpp" // plssvm::{real_type, THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE, PADDING_SIZE} #include "plssvm/kernel_function_types.hpp" // plssvm::kernel_function_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform #include "sycl/sycl.hpp" // sycl::handler, sycl::range, sycl::nd_item, sycl::local_accessor @@ -28,32 +30,37 @@ namespace plssvm::sycl::detail::work_group { /** * @brief Calculate the `q` vector used to speedup the prediction using the linear kernel function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_w_linear { public: + /// The used SYCL kernel invocation type. 
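// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the accumulation pattern behind
// the single `detail::atomic_op{ prediction_[...] } += ...` kept above.
// Several work-groups own disjoint support-vector blocks but contribute to the
// same prediction entry, so the final add must be atomic. This host-side
// analogue uses std::atomic with a compare-exchange loop; the device code uses
// the plssvm atomic wrapper instead. Names and sizes are simplified stand-ins.
// ---------------------------------------------------------------------------
#include <atomic>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

// atomic `target += value` for floating-point values via compare-exchange
void atomic_add(std::atomic<double> &target, const double value) {
    double expected = target.load();
    while (!target.compare_exchange_weak(expected, expected + value)) {
        // on failure, expected has been refreshed with the current value; retry
    }
}

int main() {
    constexpr std::size_t num_classes = 4;
    constexpr std::size_t num_blocks = 8;  // e.g. work-groups over support-vector blocks

    std::vector<std::atomic<double>> prediction(num_classes);
    for (std::atomic<double> &p : prediction) { p.store(0.0); }  // explicit zero init

    // every "work-group" contributes a partial result to every class
    std::vector<std::thread> workers;
    for (std::size_t block = 0; block < num_blocks; ++block) {
        workers.emplace_back([&, block] {
            for (std::size_t c = 0; c < num_classes; ++c) {
                atomic_add(prediction[c], static_cast<double>(block + c));
            }
        });
    }
    for (std::thread &t : workers) { t.join(); }

    for (std::size_t c = 0; c < num_classes; ++c) {
        std::cout << prediction[c].load() << ' ';  // 28 36 44 52
    }
    std::cout << '\n';
}
// ---------------------------------------------------------------------------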
+ constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in,out] w_d the vector to speedup the linear prediction - * @param[in] alpha_d the previously learned weights - * @param[in] sv_d the support vectors + * @param[in,out] w the vector to speedup the linear prediction + * @param[in] alpha the previously learned weights + * @param[in] support_vectors the support vectors * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors - * @param[in] device_specific_num_sv the number of support vectors the current device is responsible for - * @param[in] sv_offset the first support vector (row in @p alpha_d) the current device is responsible for + * @param[in] device_num_sv the number of support vectors the current device is responsible for + * @param[in] device_sv_offset the first support vector (row in @p alpha) the current device is responsible for * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_w_linear(::sycl::handler &cgh, real_type *w_d, const real_type *alpha_d, const real_type *sv_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_specific_num_sv, const std::size_t sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_feature_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_alpha_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - w_d_{ w_d }, - alpha_d_{ alpha_d }, - sv_d_{ sv_d }, + device_kernel_w_linear(::sycl::handler &cgh, real_type *w, const real_type *alpha, const real_type *support_vectors, const std::size_t num_classes, const std::size_t num_sv, const std::size_t device_num_sv, const std::size_t device_sv_offset, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + feature_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + alpha_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_{ w }, + alpha_{ alpha }, + support_vectors_{ support_vectors }, num_classes_{ num_classes }, num_sv_{ num_sv }, - device_specific_num_sv_{ device_specific_num_sv }, - sv_offset_{ sv_offset }, + device_num_sv_{ device_num_sv }, + device_sv_offset_{ device_sv_offset }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset } { } @@ -67,104 +74,130 @@ class device_kernel_w_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = 
nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t sv = 0; sv < device_specific_num_sv_; sv += THREAD_BLOCK_SIZE) { - // load data into local memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_feature_idx = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto feature_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_features + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - data_cache_feature_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[global_feature_idx * (device_specific_num_sv_ + PADDING_SIZE_uz) + sv + threadIdx_x]; // SoA - data_cache_alpha_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[global_class_idx * (num_sv_ + PADDING_SIZE_uz) + sv + sv_offset_ + threadIdx_x]; // AoS - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all support vectors using blocking to be able to cache them for faster memory accesses + for (std::size_t sv_block = 0; sv_block < device_num_sv_; sv_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal 
= 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_feature_idx_linear = feature_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_feature][internal_class] += data_cache_alpha_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_feature_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + // store the values in the local memory + feature_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[global_feature_idx_linear * (device_num_sv_ + PADDING_SIZE_uz) + sv_block + threadIdx_x]; // SoA + alpha_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[global_class_idx_linear * (num_sv_ + PADDING_SIZE_uz) + sv_block + device_sv_offset_ + threadIdx_x]; // AoS + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data + + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the sv is the fastest moving index + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + sum += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + temp[internal_feature][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the sv is the slowest moving index + for (unsigned sv = 0; sv < THREAD_BLOCK_SIZE; ++sv) { + for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_feature][internal_class] += alpha_cache_[sv][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * feature_cache_[sv][local_id_1 * INTERNAL_BLOCK_SIZE + internal_feature]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one + // calculate the indices used in the current work-item + const auto feature_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_features + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global w-vector with the locally cached values for (unsigned internal_feature = 0; internal_feature < INTERNAL_BLOCK_SIZE; ++internal_feature) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - const auto global_class_idx = class_idx + static_cast(internal_class); + // calculate the indices to access the global data const auto global_feature_idx = feature_idx + static_cast(internal_feature); + const auto global_class_idx = class_idx + 
static_cast(internal_class); - w_d_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; + w_[global_feature_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_feature][internal_class]; // SoA } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_feature_; + ::sycl::local_accessor feature_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_alpha_; + ::sycl::local_accessor alpha_cache_; /// @cond Doxygen_suppress - real_type *w_d_; - const real_type *alpha_d_; - const real_type *sv_d_; + real_type *w_; + const real_type *alpha_; + const real_type *support_vectors_; const std::size_t num_classes_; const std::size_t num_sv_; - const std::size_t device_specific_num_sv_; - const std::size_t sv_offset_; + const std::size_t device_num_sv_; + const std::size_t device_sv_offset_; const std::size_t grid_x_offset_; const std::size_t grid_y_offset_; /// @endcond }; /** - * @brief Predict the @p predict_points_d using the linear kernel speeding up the calculation using the @p w_d vector. + * @brief Predict the @p predict_points using the linear kernel speeding up the calculation using the @p w vector. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform */ +template class device_kernel_predict_linear { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. * @param[in] cgh the SYCL handler used to allocate the local memory - * @param[out] prediction_d the predicted values - * @param[in] w_d the vector to speedup the calculations - * @param[in] rho_d the previously learned bias - * @param[in] predict_points_d the data points to predict + * @param[out] prediction the predicted values + * @param[in] w the vector to speedup the calculations + * @param[in] rho the previously learned bias + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_predict_points the number of data points to predict * @param[in] num_features the number of features per data point * @param[in] grid_x_offset the offset in x-dimension into the data points if more than one execution grid has to be used * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used */ - device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction_d, const real_type *w_d, const real_type *rho_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_w_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - w_d_{ w_d }, - rho_d_{ rho_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict_linear(::sycl::handler &cgh, real_type *prediction, const real_type *w, const real_type *rho, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_predict_points, const std::size_t 
num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset) : + pp_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + w_cache_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + w_{ w }, + rho_{ rho }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, @@ -181,71 +214,91 @@ class device_kernel_predict_linear { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; - const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; - // iterate over all support vectors using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { - // load data into shared memory - for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + 
static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_class_idx = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + { + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto class_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_classes - // store the values in the local memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_w_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_d_[(dim + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx]; - } - nd_idx.barrier(); // wait until all work-items loaded their part of the data + // iterate over all features using blocking to be able to cache them for faster memory accesses + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { + // load data into local memory + for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_class_idx_linear = class_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + + // store the values in the local memory + pp_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + w_cache_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = w_[(feature_block + threadIdx_x) * (num_classes_ + PADDING_SIZE_uz) + global_class_idx_linear]; // SoA + } + nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the dot product calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { - for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { - temp[internal_pd][internal_class] += data_cache_w_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]; + if constexpr (target == target_platform::cpu) { + // perform the dot product calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += w_cache_[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + temp[internal_pp][internal_class] += sum; + } + } + } else { + // perform the dot product calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + temp[internal_pp][internal_class] += w_cache_[feature][local_id_0 * 
INTERNAL_BLOCK_SIZE + internal_class] * pp_cache_[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]; + } + } } } + nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // update global array with local one - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + const auto class_idx = (blockIdx_x * blockDim_x + threadIdx_x) * INTERNAL_BLOCK_SIZE_uz; // num_classes + + // update the global array with the local one + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_class = 0; internal_class < INTERNAL_BLOCK_SIZE; ++internal_class) { + // calculate the indices to access the global data + const auto global_pp_idx = pp_idx + static_cast(internal_pp); const auto global_class_idx = class_idx + static_cast(internal_class); - const auto global_pp_idx = pp_idx + static_cast(internal_pd); - prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pd][internal_class] - rho_d_[global_class_idx]; + prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + global_class_idx] = temp[internal_pp][internal_class] - rho_[global_class_idx]; // AoS } } } private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor pp_cache_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_w_; + ::sycl::local_accessor w_cache_; /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *w_d_; - const real_type *rho_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *w_; + const real_type *rho_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_predict_points_; const std::size_t num_features_; @@ -255,22 +308,26 @@ class device_kernel_predict_linear { }; /** - * @brief Predict the @p predict_points_d using the @p kernel_function. + * @brief Predict the @p predict_points using the @p kernel_function. * @details Uses SYCL's work-group data parallel kernels. + * @tparam target the target platform * @tparam kernel_function the type of the used kernel function * @tparam Args the types of the parameters necessary for the specific kernel function; stored in a `std::tuple` */ -template +template class device_kernel_predict { public: + /// The used SYCL kernel invocation type. + constexpr static sycl::kernel_invocation_type invocation_type = sycl::kernel_invocation_type::work_group; + /** * @brief Initialize the SYCL kernel function object. 
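     * @details The variadic @p kernel_function_parameter values are cheap scalar kernel parameters (e.g., gamma,
     *          degree, and coef0 for the polynomial kernel) and are therefore captured by value in a `std::tuple`;
     *          they are later passed to `detail::apply_kernel_function` inside the function call operator.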
* @param[in] cgh the SYCL handler used to allocate the local memory - * @param[in] prediction_d the predicted values - * @param[in] alpha_d the previously learned weights - * @param[in] rho_d the previously learned biases - * @param[in] sv_d the support vectors - * @param[in] predict_points_d the data points to predict + * @param[in] prediction the predicted values + * @param[in] alpha the previously learned weights + * @param[in] rho the previously learned biases + * @param[in] support_vectors the support vectors + * @param[in] predict_points the data points to predict * @param[in] num_classes the number of classes * @param[in] num_sv the number of support vectors * @param[in] num_predict_points the number of data points to predict @@ -279,21 +336,21 @@ class device_kernel_predict { * @param[in] grid_y_offset the offset in y-dimension into the data points if more than one execution grid has to be used * @param[in] kernel_function_parameter the parameters necessary to apply the @p kernel_function */ - device_kernel_predict(::sycl::handler &cgh, real_type *prediction_d, const real_type *alpha_d, const real_type *rho_d, const real_type *sv_d, const real_type *predict_points_d, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : - data_cache_pp_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - data_cache_sv_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, - prediction_d_{ prediction_d }, - alpha_d_{ alpha_d }, - rho_d_{ rho_d }, - sv_d_{ sv_d }, - predict_points_d_{ predict_points_d }, + device_kernel_predict(::sycl::handler &cgh, real_type *prediction, const real_type *alpha, const real_type *rho, const real_type *support_vectors, const real_type *predict_points, const std::size_t num_classes, const std::size_t num_sv, const std::size_t num_predict_points, const std::size_t num_features, const std::size_t grid_x_offset, const std::size_t grid_y_offset, Args... kernel_function_parameter) : + cache_one_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + cache_two_{ ::sycl::range<2>{ static_cast(THREAD_BLOCK_SIZE), static_cast(INTERNAL_BLOCK_SIZE) * static_cast(THREAD_BLOCK_SIZE) }, cgh }, + prediction_{ prediction }, + alpha_{ alpha }, + rho_{ rho }, + support_vectors_{ support_vectors }, + predict_points_{ predict_points }, num_classes_{ num_classes }, num_sv_{ num_sv }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, grid_x_offset_{ grid_x_offset }, grid_y_offset_{ grid_y_offset }, - kernel_function_parameter_{ std::make_tuple(std::forward(kernel_function_parameter)...) } { } + kernel_function_parameter_{ std::make_tuple(kernel_function_parameter...) } { } /** * @brief Function call operator overload performing the actual calculation. 
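     * @details The work of one work-item is split into two phases: the feature dimension is first traversed in
     *          blocks of THREAD_BLOCK_SIZE, caching predict points and support vectors in local memory and
     *          accumulating the pairwise feature reductions; the kernel function is then applied to these sums, and
     *          the class dimension is traversed in blocks, multiplying with the cached weights and atomically adding
     *          the results to the global prediction values. The bias (rho) is subtracted exactly once, by the
     *          work-groups with `blockIdx_x == 0`. On the CPU target, the reduction loop is reordered so that the
     *          feature index is the innermost (fastest moving) loop.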
@@ -305,44 +362,63 @@ class device_kernel_predict { const auto local_id_1 = static_cast(nd_idx.get_local_id(1)); // cast all values to 64-bit std::size_t to prevent potential 32-bit overflows - const std::size_t threadIdx_x = nd_idx.get_local_id(0); // current thread in block x-dimension - const std::size_t threadIdx_y = nd_idx.get_local_id(1); // current thread in block y-dimension - const std::size_t blockDim_x = nd_idx.get_local_range(0); // number of threads in block x-dimension - const std::size_t blockDim_y = nd_idx.get_local_range(1); // number of threads in block y-dimension - const std::size_t blockIdx_x = nd_idx.get_group(0) + grid_x_offset_; // current block in grid x-dimension + offsets if the grid size would be too large - const std::size_t blockIdx_y = nd_idx.get_group(1) + grid_y_offset_; // current block in grid y-dimension + offsets if the grid size would be too large - const auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); - const auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); - const auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); + constexpr auto INTERNAL_BLOCK_SIZE_uz = static_cast(INTERNAL_BLOCK_SIZE); + constexpr auto THREAD_BLOCK_SIZE_uz = static_cast(THREAD_BLOCK_SIZE); + constexpr auto PADDING_SIZE_uz = static_cast(PADDING_SIZE); - // calculate the indices used in the current work-item - const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; - const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; - const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; + const auto threadIdx_x = static_cast(nd_idx.get_local_id(0)); // current work-item in work-group x-dimension + const auto threadIdx_y = static_cast(nd_idx.get_local_id(1)); // current work-item in work-group y-dimension + const auto blockDim_x = static_cast(nd_idx.get_local_range(0)); // number of work-items in work-group x-dimension + const auto blockDim_y = static_cast(nd_idx.get_local_range(1)); // number of work-items in work-group y-dimension + const auto blockIdx_x = static_cast(nd_idx.get_group(0)) + grid_x_offset_; // current work-group in global range x-dimension + offsets if the global range is too large + const auto blockIdx_y = static_cast(nd_idx.get_group(1)) + grid_y_offset_; // current work-group in global range y-dimension + offsets if the global range is too large // create a work-item private array used for internal caching real_type temp[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]{}; { + // rename cached arrays + auto &pp_cache = cache_one_; + auto &sv_cache = cache_two_; + + // calculate the indices used in the current thread, pays attention to coalesced memory accesses + const auto pp_idx_linear = blockIdx_y * blockDim_y * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_predict_points + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + // iterate over all features using blocking to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_features_; dim += THREAD_BLOCK_SIZE_uz) { + for (std::size_t feature_block = 0; feature_block < num_features_; feature_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const auto global_pp_idx = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - const auto global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // 
calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_pp_idx_linear = pp_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - // store the values in the shared memory - data_cache_pp_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_d_[(dim + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx]; - data_cache_sv_[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = sv_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // store the values in the local memory + pp_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = predict_points_[(feature_block + threadIdx_x) * (num_predict_points_ + PADDING_SIZE_uz) + global_pp_idx_linear]; // SoA + sv_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = support_vectors_[(feature_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // SoA } nd_idx.barrier(); // wait until all work-items loaded their part of the data - // perform the feature reduction calculation - for (unsigned block_dim = 0; block_dim < THREAD_BLOCK_SIZE; ++block_dim) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + if constexpr (target == target_platform::cpu) { + // perform the feature reduction calculation, the feature is the fastest moving index + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] += detail::feature_reduce(data_cache_sv_[block_dim][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], - data_cache_pp_[block_dim][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pd]); + real_type sum{ 0.0 }; + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + sum += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } + temp[internal_pp][internal_sv] += sum; + } + } + } else { + // perform the feature reduction calculation, the feature is the slowest moving index + for (unsigned feature = 0; feature < THREAD_BLOCK_SIZE; ++feature) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { + for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { + temp[internal_pp][internal_sv] += detail::feature_reduce(sv_cache[feature][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv], + pp_cache[feature][local_id_1 * INTERNAL_BLOCK_SIZE + internal_pp]); + } } } } @@ -351,28 +427,34 @@ class device_kernel_predict { } // update temp using the respective kernel function - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - temp[internal_pd][internal_sv] = detail::apply_kernel_function(temp[internal_pd][internal_sv], kernel_function_parameter_); + temp[internal_pp][internal_sv] = detail::apply_kernel_function(temp[internal_pp][internal_sv], kernel_function_parameter_); } } { // rename cached arrays - auto &alpha_cache = data_cache_pp_; - auto &out_cache = data_cache_sv_; + auto &alpha_cache = cache_one_; + auto &out_cache = cache_two_; - // iterate over all features using blocking 
to be able to cache them for faster memory accesses - for (std::size_t dim = 0; dim < num_classes_; dim += THREAD_BLOCK_SIZE_uz) { + // calculate the indices used in the current work-item + const auto pp_idx = (blockIdx_y * blockDim_y + threadIdx_y) * INTERNAL_BLOCK_SIZE_uz; // num_predict_points + // calculate the indices used in the current work-item, pays attention to coalesced memory accesses + const auto sv_idx_linear = blockIdx_x * blockDim_x * INTERNAL_BLOCK_SIZE_uz + threadIdx_y; // num_support_vectors + + // iterate over all classes using blocking to be able to cache them for faster memory accesses + for (std::size_t class_block = 0; class_block < num_classes_; class_block += THREAD_BLOCK_SIZE_uz) { // load data into local memory for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { - const std::size_t global_sv_idx = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; - - alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_d_[(dim + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx]; + // calculate the indices to access the global data, pays attention to coalesced memory accesses + const auto global_sv_idx_linear = sv_idx_linear + static_cast(internal) * THREAD_BLOCK_SIZE_uz; + // store the values in the local memory + alpha_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = alpha_[(class_block + threadIdx_x) * (num_sv_ + PADDING_SIZE_uz) + global_sv_idx_linear]; // AoS // the bias (rho) must only be applied once for all support vectors if (blockIdx_x == std::size_t{ 0 }) { - out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_d_[dim + threadIdx_x]; + out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = -rho_[class_block + threadIdx_x]; } else { out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1] = real_type{ 0.0 }; } @@ -381,20 +463,21 @@ class device_kernel_predict { // calculate intermediate results and store them in local memory for (unsigned class_idx = 0; class_idx < THREAD_BLOCK_SIZE; ++class_idx) { - for (unsigned internal_pd = 0; internal_pd < INTERNAL_BLOCK_SIZE; ++internal_pd) { + for (unsigned internal_pp = 0; internal_pp < INTERNAL_BLOCK_SIZE; ++internal_pp) { for (unsigned internal_sv = 0; internal_sv < INTERNAL_BLOCK_SIZE; ++internal_sv) { - out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pd * THREAD_BLOCK_SIZE + local_id_1] += - temp[internal_pd][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; + out_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][internal_pp * THREAD_BLOCK_SIZE + local_id_1] += + temp[internal_pp][internal_sv] * alpha_cache[(class_idx + local_id_0) % THREAD_BLOCK_SIZE][local_id_0 * INTERNAL_BLOCK_SIZE + internal_sv]; } } nd_idx.barrier(); // wait until all work-items performed their part of the calculations } - // add intermediate cached results to prediction_d + // atomically add the intermediate cached results to the prediction for (unsigned internal = 0; internal < INTERNAL_BLOCK_SIZE; ++internal) { + // calculate the indices to access the global data const auto global_pp_idx = pp_idx + static_cast(internal); - detail::atomic_op{ prediction_d_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + dim + threadIdx_x] } += out_cache[local_id_0][internal * THREAD_BLOCK_SIZE + local_id_1]; + detail::atomic_op{ prediction_[global_pp_idx * (num_classes_ + PADDING_SIZE_uz) + class_block + threadIdx_x] } += out_cache[local_id_0][internal * 
THREAD_BLOCK_SIZE + local_id_1]; } nd_idx.barrier(); // wait until all work-items updated their part of the prediction } @@ -403,16 +486,16 @@ class device_kernel_predict { private: /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_pp_; + ::sycl::local_accessor cache_one_; /// Local memory used for internal memory access optimizations. - ::sycl::local_accessor data_cache_sv_; + ::sycl::local_accessor cache_two_; /// @cond Doxygen_suppress - real_type *prediction_d_; - const real_type *alpha_d_; - const real_type *rho_d_; - const real_type *sv_d_; - const real_type *predict_points_d_; + real_type *prediction_; + const real_type *alpha_; + const real_type *rho_; + const real_type *support_vectors_; + const real_type *predict_points_; const std::size_t num_classes_; const std::size_t num_sv_; const std::size_t num_predict_points_; diff --git a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp index 6f0772db0..c03aa46b0 100644 --- a/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp +++ b/src/plssvm/backends/SYCL/AdaptiveCpp/csvm.cpp @@ -67,6 +67,144 @@ #include // std::get #include // std::vector +namespace { + +/** + * @brief Run the kernel functor on the given device. + * @tparam KernelFunctor the type of the kernel functor to run + * @tparam QueueType the type of the SYCL queue to run the kernel on + * @tparam Args the types of the parameters necessary for the specific kernel functor + * @param[in] device the SYCL queue to run the kernel on + * @param[in] partial_grid the number of work-groups in each dimension of the execution grid + * @param[in] block the number of work-items in each dimension per work-group + * @param[in] args the parameters necessary for the specific kernel functor + */ +template +void run_kernel_functor(const QueueType &device, const plssvm::detail::dim_type partial_grid, const plssvm::detail::dim_type block, Args &&...args) { + constexpr plssvm::sycl::kernel_invocation_type invocation = KernelFunctor::invocation_type; + + if constexpr (invocation == plssvm::sycl::kernel_invocation_type::basic) { + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + cgh.parallel_for(plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block), + KernelFunctor{ std::forward(args)... }); + }); + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::work_group) { + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + cgh.parallel_for(plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block), + KernelFunctor{ cgh, std::forward(args)... }); + }); + } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::hierarchical) { +#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED) + device.impl->sycl_queue.submit([&](::sycl::handler &cgh) { + const auto exec_range = plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block); + cgh.parallel_for_work_group(exec_range.get_global_range(), exec_range.get_local_range(), KernelFunctor{ std::forward(args)... }); + }); +#else + throw plssvm::adaptivecpp::backend_exception{ "Support for sycl::kernel_invocation_type::hierarchical was disabled!" 
};
+#endif
+    } else if constexpr (invocation == plssvm::sycl::kernel_invocation_type::scoped) {
+#if defined(PLSSVM_SYCL_HIERARCHICAL_AND_SCOPED_KERNELS_ENABLED)
+        device.impl->sycl_queue.submit([&](::sycl::handler &cgh) {
+            const auto exec_range = plssvm::adaptivecpp::detail::get_execution_range(partial_grid, block);
+            cgh.parallel(exec_range.get_global_range(), exec_range.get_local_range(), KernelFunctor{ std::forward<Args>(args)... });
+        });
+#else
+        throw plssvm::adaptivecpp::backend_exception{ "Support for sycl::kernel_invocation_type::scoped was disabled!" };
+#endif
+    } else {
+        static_assert(::plssvm::detail::always_false_v<KernelFunctor>, "Unsupported kernel invocation type!");
+    }
+}
+
+/**
+ * @brief Dispatch the kernel functor to the correct kernel function type.
+ * @tparam KernelFunctor the type of the kernel functor to run
+ * @tparam target the target platform to run the kernel on
+ * @tparam Args the types of the parameters necessary for the specific kernel functor; stored in a `std::tuple`
+ * @param[in] params the parameters used to determine the kernel function type
+ * @param[in] args the parameters necessary for the specific kernel functor
+ */
+template
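/*
 * A minimal usage sketch for run_kernel_functor() (illustrative only; the chosen functor, the `devices_` member,
 * the device_ptr `.get()` accessors, and the concrete argument names are assumptions, not taken from this patch):
 *
 *     using functor_type = device_kernel_w_linear<plssvm::target_platform::cpu>;
 *     run_kernel_functor<functor_type>(devices_[device_id], partial_grid, block,
 *                                      w_d.get(), alpha_d.get(), sv_d.get(),
 *                                      num_classes, num_sv, device_specific_num_sv, sv_offset,
 *                                      grid_x_offset, grid_y_offset);
 *
 * Because device_kernel_w_linear declares invocation_type = kernel_invocation_type::work_group, the work_group
 * branch above is taken and the functor is constructed with the ::sycl::handler followed by the forwarded arguments.
 */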